输出: state year pop 0 Ohio 2000 1.5 1 Ohio 2001 1.7 2 Ohio 2002 3.6 3 Nevada 2001 2.4 4 Nevada 2002 2.9 RangeIndex(start=0, stop=5, step=1) Index(['state', 'year', 'pop'], dtype='object') one two three 0 Ohio 2000 1.5 1 Ohio 2001 1.7 a Ohio 2002 3.6 b Nevada 2001 2.4 3 Nevada 2002 2.9
输出: one two three 1 Ohio 2001.0 1.7 2 NaN NaN NaN one 0 Ohio 1 Ohio a Ohio b Nevada 3 Nevada one 1 Ohio 2 NaN one two 0 Ohio 2000 1 Ohio 2001 a Ohio 2002 b Nevada 2001 3 Nevada 2002 <class 'pandas.core.series.Series'> 0 Ohio 1 Ohio a Ohio b Nevada 3 Nevada Name: one, dtype: object
输出: one two three 1 Ohio 2001 1.7 a Ohio 2002 3.6 two 0 2000 1 2001 a 2002 b 2001 3 2002 one three 1 Ohio 1.7 a Ohio 3.6 0 1.5 1 1.7 a 3.6 b 2.4 3 2.9 Name: three, dtype: float64
输出: 1 Ohio a Ohio Name: one, dtype: object 1 2001 a 2002 Name: two, dtype: int64
输出: state year pop 0 Ohio 2000 1.5 1 Ohio 2001 1.7 2 Ohio 2002 3.6 3 Nevada 2001 2.4 4 Nevada 2002 2.9 state year pop four 0 Ohio 2000 1.5 1 1 Ohio 2001 1.7 2 2 Ohio 2002 3.6 3 3 Nevada 2001 2.4 4 4 Nevada 2002 2.9 5 state year pop four 0 Ohio 2000 1.5 1 1 Ohio 2001 1.7 2 2 Ohio 2002 3.6 3 3 Nevada 2001 2.4 4 4 Nevada 2002 2.9 5 last 5 5 5.0 5 1 state year four 0 Ohio 2000 1 1 Ohio 2001 2 2 Ohio 2002 3 3 Nevada 2001 4 4 Nevada 2002 5 last 5 5 5 state year four 0 Ohio 2000 1 1 Ohio 2001 2 2 Ohio 2002 3 last 5 5 5 2 state city year 0 Ohio NaN 2000 1 Ohio NaN 2001 2 Ohio NaN 2002 last 5 NaN 5 state city year 0 Ohio 8 2000 1 Ohio 8 2001 2 Ohio 8 2002 last 5 8 5 state city year 0 Ohio 8.0 2000.0 charu NaN NaN NaN 1 Ohio 8.0 2001.0 2 Ohio 8.0 2002.0 last 5 8.0 5.0 state city year 0 Ohio 8.0 2000.0 charu 9 9.0 9.0 1 Ohio 8.0 2001.0 2 Ohio 8.0 2002.0 last 5 8.0 5.0
concat
""" pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True) """ #合并两个 import numpy as np df1 = pd.DataFrame(np.ones((4, 4))*1, columns=list('DCBA'), index=list('4321')) df2 = pd.DataFrame(np.ones((4, 4))*2, columns=list('FEDC'), index=list('6543')) df3 = pd.DataFrame(np.ones((4, 4))*2, columns=list('FEDC'), index=list('6543')) print(df1) print(df2) print(pd.concat([df1, df2])) print(pd.concat([df1, df2], axis=1)) #默认值:axis=0 #axis=0:竖方向(index)合并,合并方向index作列表相加(可重复),非合并方向columns取并集 #axis=1:横方向(columns)合并,合并方向columns作列表相加(可重复),非合并方向index取并集 #备注:原df中,取并集的行/列名称不能有重复项,即axis=0时columns不能有重复项,axis=1时index不能有重复项:对于参与和并单个矩阵而言 df1.columns = list('DDBA') print(df1) pd.concat([df1, df2], axis=0)#ValueError: Plan shapes are not aligned输出: D C B A 4 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 F E D C 6 2.0 2.0 2.0 2.0 5 2.0 2.0 2.0 2.0 4 2.0 2.0 2.0 2.0 3 2.0 2.0 2.0 2.0 A B C D E F 4 1.0 1.0 1.0 1.0 NaN NaN 3 1.0 1.0 1.0 1.0 NaN NaN 2 1.0 1.0 1.0 1.0 NaN NaN 1 1.0 1.0 1.0 1.0 NaN NaN 6 NaN NaN 2.0 2.0 2.0 2.0 5 NaN NaN 2.0 2.0 2.0 2.0 4 NaN NaN 2.0 2.0 2.0 2.0 3 NaN NaN 2.0 2.0 2.0 2.0 D C B A F E D C 1 1.0 1.0 1.0 1.0 NaN NaN NaN NaN 2 1.0 1.0 1.0 1.0 NaN NaN NaN NaN 3 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 4 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 5 NaN NaN NaN NaN 2.0 2.0 2.0 2.0 6 NaN NaN NaN NaN 2.0 2.0 2.0 2.0 D D B A 4 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0
merge
""" merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None) 基于一列或多列,列名要在两个矩阵中都能找到,进行拼接 """ left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'], 'key2': ['K0', 'K1', 'K0', 'K1'],'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3']}) right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],'key2': ['K0', 'K0', 'K0', 'K0'],'C': ['C0', 'C1', 'C2', 'C3'],'D': ['D0', 'D1', 'D2', 'D3']}) print(left) print(right) #默认相同列名作为连接键,从第一个里面找 K0 K0在right中有,K0,K1在right中没有,K1 K0在right中有两个,K2 K1没有 """ key1 key2 A B C D 0 K0 K0 A0 B0 C0 D0 1 K1 K0 A2 B2 C1 D1 2 K1 K0 A2 B2 C2 D2 #因为AB只有一次K1 K0所以复制一次 """ print(pd.merge(left, right)) print(pd.merge(left, right, on=['key1', 'key2'])) #以右边矩阵为基础,没有的就NAN print(pd.merge(left, right, how='right', on=['key1', 'key2'])) #求并,保留所有行,没有的补充NAN result = pd.merge(left, right, how='outer', on=['key1', 'key2']) print(result)输出: key1 key2 A B 0 K0 K0 A0 B0 1 K0 K1 A1 B1 2 K1 K0 A2 B2 3 K2 K1 A3 B3 key1 key2 C D 0 K0 K0 C0 D0 1 K1 K0 C1 D1 2 K1 K0 C2 D2 3 K2 K0 C3 D3 key1 key2 A B C D 0 K0 K0 A0 B0 C0 D0 1 K1 K0 A2 B2 C1 D1 2 K1 K0 A2 B2 C2 D2 key1 key2 A B C D 0 K0 K0 A0 B0 C0 D0 1 K1 K0 A2 B2 C1 D1 2 K1 K0 A2 B2 C2 D2 key1 key2 A B C D 0 K0 K0 A0 B0 C0 D0 1 K1 K0 A2 B2 C1 D1 2 K1 K0 A2 B2 C2 D2 3 K2 K0 NaN NaN C3 D3 key1 key2 A B C D 0 K0 K0 A0 B0 C0 D0 1 K0 K1 A1 B1 NaN NaN 2 K1 K0 A2 B2 C1 D1 3 K1 K0 A2 B2 C2 D2 4 K2 K1 A3 B3 NaN NaN 5 K2 K0 NaN NaN C3 D3
update
""" updata主要是数据更新,尽可能多的更新 """ df = pd.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]}) print(df) new_df = pd.DataFrame({'B': [4, 5, 6, 7, 8], 'C': [7, 8, 9, 10, 11]})#根据列名和行名进行匹配 df.update(new_df) print(df) df = pd.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]}) df.index= list("ABC") print(df.index) new_df = pd.DataFrame({'B': [4, 5, 6, 7, 8], 'C': [7, 8, 9, 10, 11]})#根据列名和行index进行匹配,而鄙视行名成 df.update(new_df) print(df) #使用series更新 new_column = pd.Series(['d', 'e'], name='B', index=list("AB")) print(new_column) df.update(new_column)#替换中如果新表中数据是NAN则不替换 print(df)A B 0 1 400 1 2 500 2 3 600 A B 0 1 4.0 1 2 5.0 2 3 6.0 Index([‘A’, ‘B’, ‘C’], dtype=‘object’) A B A 1 400 B 2 500 C 3 600 A d B e Name: B, dtype: object A B A 1 d B 2 e C 3 600
append
""" append """ df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) df2 = pd.DataFrame([[5, 6,7], [7, 8,9],[9,10,11]], columns=list('ABC')) print(df) print(df2) print(df.append(df2))#列为两个列的并 print(df.append(df2,ignore_index=True))输出: A B 0 1 2 1 3 4 A B C 0 5 6 7 1 7 8 9 2 9 10 11 A B C 0 1 2 NaN 1 3 4 NaN 0 5 6 7.0 1 7 8 9.0 2 9 10 11.0 A B C 0 1 2 NaN 1 3 4 NaN 2 5 6 7.0 3 7 8 9.0 4 9 10 11.0
