Python之pandas基本操作基础

it2024-07-03  42

#为什么要学习pandas,numpy能够帮我们处理数值型的数据,但是这还不够 #而pandas能处理字符串,还有时间序列 import pandas as pd import numpy as np import string #创建一维Series的方法: # #1,默认索引为从0开始的数值 # a=pd.Series([1,2,3,4,5,6,]) # print (a) # print(type(a)) # #返回: # # 0 1 # # 1 2 # # 2 3 # # 3 4 # # 4 5 # # 5 6 # # dtype: int64 # # <class 'pandas.core.series.Series'> # #2.指定索引创建Series # a=pd.Series([1,2,3,4,5],index=list("abcde")) # print (a) # print(type(a)) # #返回 # # a 1 # # b 2 # # c 3 # # d 4 # # e 5 # # dtype: int64 # # <class 'pandas.core.series.Series'> # #3.用字典来创建Series,Series的键就是字典的键,Series的值就是字典的值 # a={"name":'python',"age":25,"tel":10000} # print(pd.Series(a)) # print(type(a)) # #返回: # # name python # # age 25 # # tel 10000 # # dtype: object # # <class 'dict'> # #另一种方法,通过for循环生成字典,索引用string.ascii_uppercase生成的大写字母 # a={string.ascii_uppercase[i]:i for i in range(10)} # print (a) # #{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9} # #将上面的字典生成一个Series # print(pd.Series(a)) # # A 0 # # B 1 # # C 2 # # D 3 # # E 4 # # F 5 # # G 6 # # H 7 # # I 8 # # J 9 # # dtype: int64 # #和上面的有所区别,指定索引从字母F~O,那么只有F~J能对应到字典的数值, # # 其它的没有数值,所以值为NaN,并且这时的dtype为float64了 # print(pd.Series(a,index=list(string.ascii_uppercase[5:15]))) # # F 5.0 # # G 6.0 # # H 7.0 # # I 8.0 # # J 9.0 # # K NaN # # L NaN # # M NaN # # N NaN # # O NaN # # dtype: float64 # #取Series的值,可以按键名取,也可以切片取 # a={"name":'python',"age":25,"tel":10000} # b=(pd.Series(a)) # print(b["name"]) # #返回: # #python #也可以按位置来切片 # print(b[0]) # #返回: # #python # #Series的两个属性index和values的用法 # print(b.index) # #返回: # # Index(['name', 'age', 'tel'], dtype='object') # print(b.values) # # 返回: # # ['python' 25 10000] # # #通过键值的bool判断取值 # a=pd.Series([1,2,3,4,5,6,]) # print(a[a>3]) # #返回 # # 3 4 # # 4 5 # # 5 6 # # dtype: int64 # #取外部数据,pandas取外部数据的方法非常方便,取csv的格式用read_csv, # #excel,json,html,sql都有对应的方法,下面以read_csv为例: # df=pd.read_csv("./pandas_data.csv") # print(df) # #输出结果,最左边被加了一个Index列,从0开始 # # DIVISION_CODE DIVISION_NAME GROSS_FACTOR # # 0 jg gf 
12.00 # # 1 dd dd 66.00 # # 2 bb aat 55.00 # # 3 ww ww 0.30 # # 4 CR Cereal 0.85 # # .. ... ... ... # # 76 CEM CEM 1.00 # # 77 WMA WMAWMA 1.00 # # 78 jgIILLLLLL jgIILLLLLLJ 0.10 # # 79 2 adfa 2.00 # # 80 l5m l5m 12.00 # # # # [81 rows x 3 columns] # #用DataFrame构建二维series # a=pd.DataFrame(np.arange(12).reshape(3,4)) # print(a) # #返回结果: # #第一列是行索引,表明不同行,横向索引,叫index,0轴,axis=0 # #第一行是列索引,表明不同列,纵向索引,叫columns,1轴,axis=1 # # 0 1 2 3 # # 0 0 1 2 3 # # 1 4 5 6 7 # # 2 8 9 10 11 # #除了上面自动的column和index名,也可以指定名 # a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ")) # print(a) # #返回结果 # # W X Y Z # # a 0 1 2 3 # # b 4 5 6 7 # # c 8 9 10 11 # #用字典来创建DataFrame # d1={"name":["xiaoming","xiaowang"],"aga":[19,20],"tel":["10000","10086"]} # t1=pd.DataFrame(d1) # print (t1) # #返回结果: # # name aga tel # # 0 xiaoming 19 10000 # # 1 xiaowang 20 10086 # # #另一种用字典创建DataFrame方法 # d2=[{"name":"xiaohong","age":23,"tel":10010},{"name":"xiaogang","tel":20010},{"name":"xiaowang","age":28}] # t2=pd.DataFrame(d2) # print (t2) # #返回结果: # #没有指定的位置是NaN # # name age tel # # 0 xiaohong 23.0 10010.0 # # 1 xiaogang NaN 20010.0 # # 2 xiaowang 28.0 NaN # #DataFrame的一些方法 # print(t2.index) # #RangeIndex(start=0, stop=3, step=1) # print(t2.columns) # #Index(['name', 'age', 'tel'], dtype='object') # print(t2.values) # # [['xiaohong' 23.0 10010.0] # # ['xiaogang' nan 20010.0] # # ['xiaowang' 28.0 nan]] # print(t2.shape) # #(3, 3) # print(t2.dtypes) # #name object # # age float64 # # tel float64 # # dtype: object # print(t2.ndim) #显示维度,结果为2说明是二维series # #2 # print(t2.head(2))#取前两行数据 # # name age tel # # 0 xiaohong 23.0 10010.0 # # 1 xiaogang NaN 20010.0 # print(t2.tail(2))#取后两行数据 # # name age tel # # 1 xiaogang NaN 20010.0 # # 2 xiaowang 28.0 NaN # print(t2.info())#取t2的信息概览,包括行数,列数,列非空值数,列类型,行类型,占用内存大小 # # <class 'pandas.core.frame.DataFrame'> # # RangeIndex: 3 entries, 0 to 2 # # Data columns (total 3 columns): # # # Column Non-Null Count Dtype # # --- ------ -------------- ----- # # 
0 name 3 non-null object # # 1 age 2 non-null float64 # # 2 tel 2 non-null float64 # # dtypes: float64(2), object(1) # # memory usage: 200.0+ bytes # # None # print(t2.describe())#快速综合统计结果:计数,均值,标准差,最大值,最小值,四分位数 # # age tel # # count 2.000000 2.000000 # # mean 25.500000 15010.000000 # # std 3.535534 7071.067812 # # min 23.000000 10010.000000 # # 25% 24.250000 12510.000000 # # 50% 25.500000 15010.000000 # # 75% 26.750000 17510.000000 # # max 28.000000 20010.000000 # #DataFrame排序,sort_values的参数by是指排序指定的列,ascending默认为True(升序) # df=pd.read_csv("./pandas_data.csv") # print(df.head(1)) # # DIVISION_CODE DIVISION_NAME GROSS_FACTOR # # 0 jg gf 12.0 # df=df.sort_values(by="GROSS_FACTOR",ascending=False) # print(df.head(5)) # #返回的值,是以GROSS_FACTOR为倒序排序的 # # DIVISION_CODE DIVISION_NAME GROSS_FACTOR # # 1 dd dd 66.0 # # 2 bb aat 55.0 # # 71 test3 test3 22.0 # # 64 tes yt 0219 V2 15.0 # # 44 yt_up yt <html> @0219_up 13.0 #DataFrame的切片操作 # df=pd.read_csv("./pandas_data.csv") # print(df.head(1)) # # DIVISION_CODE DIVISION_NAME GROSS_FACTOR # # 0 jg gf 12.0 # #方括号写数字切片,表示取行,对行进行操作 # print(df[:20]) #取df的前20行 # print(df[20:])#取第20行以后的数据 # #方括号写字符串,表示取列索引,对列进行操作 # print(df["GROSS_FACTOR"])#取其"GROSS_FACTOR"的列 # print(df[:20]["GROSS_FACTOR"])#取前20行的"GROSS_FACTOR"值 # #用loc切片 # a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ")) # print(a) # # W X Y Z # # a 0 1 2 3 # # b 4 5 6 7 # # c 8 9 10 11 # print(a.loc["a"])#取a行数据 # # W 0 # # X 1 # # Y 2 # # Z 3 # #Name: a, dtype: int32 # print(a.loc["a","Z"])#取a行Z列数据 # #3 # print(a.loc[["a","c"],"Z"])#取a和c行的Z列 # # a 3 # # c 11 # # Name: Z, dtype: int32 # print(a.loc["a":"c","Z"]) # #这里用标签切片取a到c行的Z列,与上面只取a和c两行不同(多了b行),需要注意这里的结果是包括c行的,和其它的切片操作有区别,其它的切片操作都是包头不包尾 # # a 3 # # b 7 # # c 11 # # Name: Z, dtype: int32 # #用iloc切片,这i表示integer,用整数位置来切片 # a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ")) # print(a) # # W X Y Z # # a 0 1 2 3 # # b 4 5 6 7 # # c 8 9 10 11 # print(a.iloc[1])#取第2行的数据 # # W 4 # # X 5 # # Y 6 # # 
Z 7 # # Name: b, dtype: int32 # print(a.iloc[:,1])#取第2列 # # a 1 # # b 5 # # c 9 # # Name: X, dtype: int32 # print(a.iloc[1,1])#取第2行第2列 # #5 # print(a.iloc[1:,1])#取从第2行开始的第2列 # # b 5 # # c 9 # #赋值改变DataFrame的数值 # a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ")) # print(a) # # W X Y Z # # a 0 1 2 3 # # b 4 5 6 7 # # c 8 9 10 11 # # a.iloc[1:,1]=100#将第2行开始的第2列值都改为100 # print (a) # # W X Y Z # # a 0 1 2 3 # # b 4 100 6 7 # # c 8 100 10 11 # a.iloc[1:,1]=np.nan#直接改成NaN也是可以的 # print (a) # # W X Y Z # # a 0 1.0 2 3 # # b 4 NaN 6 7 # # c 8 NaN 10 11 # df=pd.read_csv("./pandas_data.csv") # print(df.head(1)) # # DIVISION_CODE DIVISION_NAME GROSS_FACTOR # # 0 jg gf 12.0 # print(df["GROSS_FACTOR"])#通过指定columns条件查出所有的行 # # 0 12.00 # # 1 66.00 # # 2 55.00 # # 3 0.30 # # 4 0.85 # # ... # # 76 1.00 # # 77 1.00 # # 78 0.10 # # 79 2.00 # # 80 12.00 # print(df[df["GROSS_FACTOR"]>10])#再根据所有行找出>10的行 # # DIVISION_CODE DIVISION_NAME GROSS_FACTOR # # 0 jg gf 12.0 # # 1 dd dd 66.0 # # 2 bb aat 55.0 # # 44 yt_up yt <html> @0219_up 13.0 # # 47 yzh''test yaozihe test % 's 12.0 # # 48 test bus yzh test bus 12.0 # # 57 yzh''test yaozihe test % 's 12.0 # # 58 yzh tst yzh test 2 12.0 # # 61 123'13 yzh test 12.0 # # 64 tes yt 0219 V2 15.0 # # 71 test3 test3 22.0 # # 80 l5m l5m 12.0 # # #找出"GROSS_FACTOR">10并且<20的值 # #&是且,|是或 # print(df[(df["GROSS_FACTOR"]>10)&(df["GROSS_FACTOR"]<20)]) # # DIVISION_CODE DIVISION_NAME GROSS_FACTOR # # 0 jg gf 12.0 # # 44 yt_up yt <html> @0219_up 13.0 # # 47 yzh''test yaozihe test % 's 12.0 # # 48 test bus yzh test bus 12.0 # # 57 yzh''test yaozihe test % 's 12.0 # # 58 yzh tst yzh test 2 12.0 # # 61 123'13 yzh test 12.0 # # 64 tes yt 0219 V2 15.0 # # 80 l5m l5m 12.0 # #处理缺失数据NaN # #判断数据是否是NaN,用isnull方法 # a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ")) # a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN # print(a) # # W X Y Z # # a 0 1.0 2 3 # # b 4 NaN 6 7 # # c 8 NaN 10 11 # print(pd.isnull(a))#isnull方法判断是否为NaN # 
# W X Y Z # # a False False False False # # b False True False False # # c False True False False # print(pd.notnull(a))#另一个方法notnull,结果与isnull相反 # # W X Y Z # # a True True True True # # b True False True True # # c True False True True # print(pd.notnull(a["X"]))#只取"X"列的notnull,返回的结果为bool类型 # # a True # # b False # # c False # # Name: X, dtype: bool # print(a[pd.notnull(a["X"])]) # #接着将上面的内容再套一层,因为上面只有第一行结果返回为True,所以这里只返回第一行的数据 # #bool索引可以看成一个位置矩阵,然后把位置矩阵传给数组,True的取出来,False的不取 # #这里是按"X"列中是否不是NaN得到的一个矩阵,只有第一行返回为True,然后再广播到数组里面,取出了数组第一行的数据 # # W X Y Z # # a 0 1.0 2 3 # #dropna的用法,可以将含有NaN的行或列删除 # a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ")) # a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN # print(a) # # W X Y Z # # a 0 1.0 2 3 # # b 4 NaN 6 7 # # c 8 NaN 10 11 # # print(a.dropna(axis=0))#axis=0表示删除行,所以删除矩阵里面有NaN的所在行 # # W X Y Z # # a 0 1.0 2 3 # print(a.dropna(axis=1))#axis=1表示删除列,所以删除矩阵里面有NaN的所在列 # # W Y Z # # a 0 2 3 # # b 4 6 7 # # c 8 10 11 # print(a.dropna(axis=0,how="all"))#how参数说明这一行所有的值都为NaN才删除,默认为"any" # # W X Y Z # # a 0 1.0 2 3 # # b 4 NaN 6 7 # # c 8 NaN 10 11 # print(a.dropna(axis=0,how="any"))#how="any"是默认值,所以和上面不加此参数的结果一样 # # W X Y Z # # a 0 1.0 2 3 # #另一个参数inplace,此参数默认为False,为True时原地修改矩阵 # #相当于将结果重新赋值给自己 # a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ")) # a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN # print(a) # #a.dropna(axis=0,inplace=True) # a.dropna(axis=0,inplace=True) # #加了inplace参数,相当于a=a.dropna(axis=0) # print(a) # #返回的结果已改变了a的值 # # W X Y Z # # a 0 1.0 2 3 # #填充NaN数据 # d2=[{"name":"xiaohong","age":23,"tel":10010},{"name":"xiaogang","tel":20010},{"name":"xiaowang","age":28}] # t2=pd.DataFrame(d2) # print(t2) # # name age tel # # 0 xiaohong 23.0 10010.0 # # 1 xiaogang NaN 20010.0 # # 2 xiaowang 28.0 NaN # #1.通过fillna方法直接填充指定的值,把NaN的值都填充为100 # print(t2.fillna(100)) # # name age tel # # 0 xiaohong 23.0 10010.0 # # 1 xiaogang 100.0 20010.0 # # 2 xiaowang 28.0 100.0 # #2.填充平均数 # 
print(t2.fillna(t2.mean())) # # name age tel # # 0 xiaohong 23.0 10010.0 # # 1 xiaogang 25.5 20010.0 # # 2 xiaowang 28.0 15010.0 # t2["age"]=t2["age"].fillna(t2["age"].mean()) # #如果只想更改其中一列的NaN值为平均数 # print (t2) # # name age tel # # 0 xiaohong 23.0 10010.0 # # 1 xiaogang 25.5 20010.0 # # 2 xiaowang 28.0 NaN # print(t2["age"].mean()) # # 25.5 # #age列的平均值为25.5,pandas的mean()默认会跳过NaN,这里和numpy的结果不同 # #numpy的矩阵,如果这一列或一行的数据只要有NaN的值,mean()的结果为NaN
最新回复(0)