pandas基础及机器学习中简单应用

it2024-06-18  48

文章目录

一、pandas是什么? 二、使用步骤:1.引入库;2.基本操作;3.DataFrame性质;4.应用-数值运算及统计分析;5.利用pandas求相关性系数和协方差;6.缺失值处理;7.应用举例。 总结


一、pandas是什么?

pandas 是基于 NumPy 的一种工具,该工具是为了解决数据分析任务而创建的。 Series:一维数组,与 NumPy 中的一维 array 类似。二者与 Python 基本的数据结构 List 也很相近,其区别是:List 中的元素可以是不同的数据类型,而 array 和 Series 中通常只存储相同的数据类型,这样可以更有效地使用内存,提高运算效率。 Time-Series:以时间为索引的 Series。 DataFrame:二维的表格型数据结构,很多功能与 R 中的 data.frame 类似,可以将 DataFrame 理解为 Series 的容器。以下的内容主要以 DataFrame 为主。 Panel:三维的数组,可以理解为 DataFrame 的容器(注意:Panel 已在 pandas 0.25 中被移除,三维数据建议改用带多级索引 MultiIndex 的 DataFrame)。

二、使用步骤

1.引入库

代码如下(示例):

# Common imports and global setup used by the examples in this tutorial.
import ssl
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence library warnings so the printed example output stays readable.
warnings.filterwarnings('ignore')

# SECURITY NOTE: this replaces the default HTTPS context factory with the
# unverified one, i.e. certificate checks are disabled for the whole process.
# Presumably a workaround for dataset downloads failing on certificate
# errors; do not carry this into production code.
ssl._create_default_https_context = ssl._create_unverified_context

2.基本操作

代码如下(示例):

# Basic ways to construct a pandas Series.
import numpy as np
import pandas as pd
from pandas import Series

# From a list: the index defaults to 0..n-1.
data = pd.Series([1, 5, 3, 4, 5, 6])
print(data)

# From a list with explicit index labels.
data = pd.Series([1, 5, 3, 4], index=["a", "b", "c", "d"])
print(data)

# From a one-dimensional NumPy array.
x = np.arange(5)
print(pd.Series(x))

# From a dict: keys become the index, values become the data.
population_dict = {"BeiJing": 2154,
                   "ShangHai": 2424,
                   "ShengZheng": 1303,
                   "HangZhou": 981}
population = pd.Series(population_dict)
print(population)

# With an explicit index, labels that are not keys of the dict get NaN.
population = pd.Series(population_dict, index=["BeiJing", "ShangHai", "c"])
print(population)

# Scalar data: every index label receives the same value (5 here).
print(pd.Series(5, index=[100, 200, 300]))

Pandas DataFrame对象 带标签数据的多维数组:

# Ways to construct a pandas DataFrame.
# General form: pd.DataFrame(data, index=index, columns=columns)
import numpy as np
import pandas as pd

# 1. From a single Series: the dict key becomes the column name.
population_dict = {"BeiJing": 2154,
                   "ShangHai": 2424,
                   "ShengZheng": 1303,
                   "HangZhou": 981}
population = pd.Series(population_dict)
print(pd.DataFrame({"population": population}))

# 2. From a dict of Series; the scalar "China" is repeated on every row.
GDP_dict = {"BeiJing": 30320,
            "ShangHai": 32680,
            "ShengZheng": 112303,
            "HangZhou": 12981}
GDP = pd.Series(GDP_dict)
print(pd.DataFrame({"population": population,
                    "GDP": GDP,
                    "country": "China"}))

# 3. From a list of dicts: each dict is one row.
data = [{"a": i, "b": 2 * i} for i in range(3)]
print(data)
print(pd.DataFrame(data))

# Keys missing from a row are filled with NaN.
data = [{"a": 1, "b": 1}, {"b": 3, "c": 1}]
print(pd.DataFrame(data))

# 4. From a two-dimensional NumPy array with explicit row/column labels.
data = np.random.randint(10, size=(3, 2))
print(pd.DataFrame(data, columns=["foo", "bar"], index=["a", "b", "c"]))

3.DataFrame性质

# DataFrame properties: attributes, indexing, slicing, boolean masks, assignment.
import numpy as np
import pandas as pd

# 1. Attributes
population_dict = {"BeiJing": 2154,
                   "ShangHai": 2424,
                   "ShengZheng": 1303,
                   "HangZhou": 981}
population = pd.Series(population_dict)
GDP_dict = {"BeiJing": 30320,
            "ShangHai": 32680,
            "ShengZheng": 112303,
            "HangZhou": 12981}
GDP = pd.Series(GDP_dict)
data = pd.DataFrame({"population": population,
                     "GDP": GDP,
                     "country": "China"})
print(data)
print(data.values)   # the data as a NumPy array
print(data.index)    # row labels
print(data.columns)  # column labels
print(data.shape)    # (rows, columns) -> (4, 3)
print(data.size)     # number of cells -> 12
print(data.dtypes)   # dtype of each column

# 2. Indexing
# Selecting columns
print(data["population"])  # the population column
print(data["GDP"])         # dict-style access
print(data.GDP)            # attribute-style access

# Selecting rows
print(data.loc["BeiJing"])  # by label
# By position. The collapsed original read data.iloc[0,2], which returns a
# single scalar rather than a row; iloc[0] selects the first row.
print(data.iloc[0])

# Selecting a scalar
print(data.loc["BeiJing", "GDP"])
print(data.iloc[0, 1])
print(data.values[0][1])

# Indexing a Series
print(data.GDP)
print(GDP)
print(GDP["BeiJing"])

# 3. Slicing
datas = pd.date_range(start='2020-01-01', periods=6)
print(datas)
df = pd.DataFrame(np.random.randn(6, 4), index=datas,
                  columns=["a", "b", "c", "d"])
print(df)

# Row slices (label slices include both endpoints; position slices do not)
print(df["2020-01-01":"2020-01-03"])
print(df.loc["2020-01-01":"2020-01-03"])
print(df.iloc[0:3])

# Column slices
print(df.loc[:, "a":"c"])
print(df.iloc[:, 0:3])

# Mixed selections
# Slice rows and columns together
print(df.loc["2020-01-02":"2020-01-03", "c":"d"])
print(df.iloc[1:3, 2:])

# Slice rows, pick individual columns
print(df.loc["2020-01-02":"2020-01-03", ["c", "d"]])
print(df.iloc[3:, [0, 2]])

# Pick individual rows and individual columns
print(df.loc[["2020-01-02", "2020-01-03"], ["c", "d"]])

# 4. Boolean indexing
print(df > 0)

# isin(): membership test on a column
df2 = df.copy()
df2['e'] = ['one', 'two', 'three', 'four', 'five', 'six']  # add a column
print(df2)
ind = df2['e'].isin(["two", "four"])  # boolean mask over column e
print(ind)

# 5. Assignment
# Add a new column from a Series (aligned on the index)
s1 = pd.Series([1, 2, 3, 4, 5, 6],
               index=pd.date_range('20200101', periods=6))
print(s1)
df["e"] = s1
print(df)

# Modify existing values
df.loc['2020-01-01', 'a'] = 0
df.iloc[0, 1] = 0
df["d"] = np.array([5] * len(df))
print(df)

# Replace the index and the column labels with plain integers
df.index = [i for i in range(df.shape[0])]  # same as range(len(df))
print(df)
df.columns = [i for i in range(df.shape[1])]
print(df)

4.应用-数值运算及统计分析

# Numeric operations and descriptive statistics with pandas.
# All imports sit at the top: in the collapsed original, Counter was used
# before its import statement and would have raised NameError.
import timeit
from collections import Counter

import numpy as np
import pandas as pd

# 1. Inspecting data
dates = pd.date_range(start='2019-01-01', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates,
                  columns=["A", "B", "C", "D"])
print(df)
print(df.head())   # first rows, 5 by default
print(df.head(2))
print(df.tail())   # last 5 rows
df.iloc[0, 3] = np.nan
print(df)
# Summary of the frame. info() prints its report itself and returns None,
# so it is not wrapped in print().
df.info()

# 2. NumPy operations also work on pandas objects
x = pd.DataFrame(np.arange(4).reshape(1, 4))
print(x)
y = pd.DataFrame(np.arange(4, 8).reshape(1, 4))
print(y)
print(x * y)

# Matrix-style operations
np.random.seed(42)
x = pd.DataFrame(np.random.randint(10, size=(30, 30)))
print(x)
print(x.T)
y = pd.DataFrame(np.random.randint(10, size=(30, 30)))
print(x.dot(y))
# Author's note: pandas is slower than NumPy here; pandas focuses on data
# handling, NumPy on computation.

# Broadcasting: divide every row by the first row
print(x / x.iloc[0])

# pandas-specific behaviour: index alignment
A = pd.DataFrame(np.random.randint(0, 20, size=(2, 2)), columns=list("AB"))
B = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=list("ABC"))
print(A)
print(B)
print(A + B)                    # aligned automatically; missing cells are NaN
print(A.add(B, fill_value=0))   # treat missing cells as 0

# 3. Statistics
# Counting distinct values
y = np.random.randint(3, size=20)
print(y)
print(np.unique(y))
print(Counter(y))
y1 = pd.DataFrame(y, columns=["A"])
print(y1)
print(np.unique(y1))
print(y1["A"].value_counts())

# Derive a new column and sort by it
population_dict = {"BeiJing": 2154,
                   "ShangHai": 2424,
                   "ShengZheng": 1303,
                   "HangZhou": 981}
population = pd.Series(population_dict)
GDP_dict = {"BeiJing": 30320,
            "ShangHai": 32680,
            "ShengZheng": 112303,
            "HangZhou": 12981}
GDP = pd.Series(GDP_dict)
city_info = pd.DataFrame({"population": population,
                          "GDP": GDP,
                          "country": "China"})
city_info["per_GDP"] = city_info["GDP"] / city_info["population"]
print(city_info)

# Ascending sort
print(city_info.sort_values(by="per_GDP"))
# Descending sort
print(city_info.sort_values(by="per_GDP", ascending=False))

# Sorting along an axis
data = pd.DataFrame(np.random.randint(20, size=(3, 4)),
                    index=[2, 1, 0], columns=["D", "B", "A", "C"])
print(data)
print(data.sort_index())        # sort by row labels
print(data.sort_index(axis=1))  # sort by column labels

# Aggregations
df = pd.DataFrame(np.random.normal(2, 4, size=(6, 4)), columns=list("ABCD"))
print(df)
print(df.count())      # number of non-null values per column
print(df.sum())
print(df.sum(axis=1))
# Also available: max, min, var, std, median
# mode: the most frequent value(s)
print(data.mode())
# All at once: count, mean, std, min, quartiles, max
print(df.describe())

5.利用pandas求相关性系数和协方差

# Correlation, covariance and custom column summaries.
# NOTE: this snippet relies on `df`, `pd` and `np` from the previous section.

# Pairwise correlation between all columns
print(df.corr())
# Correlation of every column with column A
print(df.corrwith(df["A"]))
# Pairwise covariance between all columns (announced by the section title
# but missing from the collapsed original)
print(df.cov())

# Custom output via apply()
print(df.apply(np.cumsum))           # cumulative sum down each column
print(df.apply(np.cumsum, axis=1))   # cumulative sum along each row
# Per-column totals. The original note said "sum of A and B", but this call
# sums every column.
print(df.sum())
print(df.apply(lambda x: x.max() - x.min()))  # range (max - min) per column


def my_describe(x):
    """Summarise one column as a Series.

    Args:
        x: a pandas Series (apply() passes one DataFrame column at a time).

    Returns:
        A Series labelled "Count", "mean", "max", "idxmin", "std" holding
        x.count(), x.mean(), x.max(), x.idxmin() and x.std() respectively.
    """
    return pd.Series(
        [x.count(), x.mean(), x.max(), x.idxmin(), x.std()],
        index=["Count", "mean", "max", "idxmin", "std"],
    )


print(df.apply(my_describe))

6.缺失值处理

# Handling missing values, then concatenating and merging frames.
import timeit
from collections import Counter

import numpy as np
import pandas as pd

# 1. Detecting missing values
data = pd.DataFrame(np.array([[1, np.nan, 2],
                              [np.nan, 3, 4],
                              [5, 6, 7]]),
                    columns=["A", "B", "C"])
print(data)
# Author's note: mixing in None/strings turns a column into object dtype,
# which costs more than int; np.nan is a special floating-point value.
print(data.dtypes)
print(data.isnull())
print(data.notnull())

# 2. Dropping missing values
print(data.dropna())         # drop every row that contains a missing value
print(data.dropna(axis=1))   # drop every column that contains a missing value
data["C"] = np.nan
print(data)
print(data.dropna(axis="columns", how="all"))  # drop columns that are all NaN
data.loc[3] = np.nan         # append a row that is entirely NaN
print(data)
print(data.dropna(how="all"))  # drop rows that are all NaN

# 3. Filling missing values
print(data.fillna(value=5))
# Fill with the mean of all non-missing cells
fill = data.stack().mean()
print(fill)
print(data.fillna(value=fill))


# Combining data
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells are "<column><index>" strings.

    Args:
        cols: iterable of column labels (a string yields one column per
            character).
        ind: iterable of index labels.

    Returns:
        A DataFrame indexed by `ind` with one column per entry of `cols`;
        the cell at (i, c) is str(c) + str(i).
    """
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind)


print(make_df("ABC", range(3)))

# Vertical concatenation
df_1 = make_df("AB", [1, 2])
df_2 = make_df("AB", [1, 4])
print(df_1)
print(df_2)
# The two frames share index label 1; ignore_index=True renumbers the rows.
print(pd.concat([df_1, df_2], ignore_index=True))

# Horizontal concatenation
df_1 = make_df("CD", [1, 2])
df_2 = make_df("AB", [1, 2])
print(df_1)
print(df_2)
print(pd.concat([df_2, df_1], axis=1))

# Aligned merge with merge(): joins on the shared column B
df_9 = make_df("AB", [1, 2])
df_10 = make_df("BC", [1, 2])
print(df_9)
print(df_10)
print(pd.merge(df_9, df_10))

7.应用举例

# Worked examples: merging city data, groupby, and pivot tables.
# Imports are declared here so the section runs on its own; the collapsed
# original used pd/np without importing them and imported seaborn mid-script.
import numpy as np
import pandas as pd
import seaborn as sns

# Merging city information
population_dict = {"city": ("BeiJing", "HangZhou", "ShenZhen"),
                   "pop": (2154, 981, 1303)}
population = pd.DataFrame(population_dict)
print(population)
GDP_dict = {"city": ("BeiJing", "ShangHai", "ShenZhen"),
            "GDP": (30320, 32680, 13468)}
GDP = pd.DataFrame(GDP_dict)
print(GDP)
city_info = pd.merge(population, GDP)                # intersection of cities
print(city_info)
city_info = pd.merge(population, GDP, how="outer")   # union of cities
print(city_info)

# Grouping and pivot tables
df = pd.DataFrame({"key": ["A", "B", "C", "C", "B", "A"],
                   "data1": range(6),
                   "data2": np.random.randint(0, 10, size=6)})
print(df)

# Grouping
print(df.groupby("key"))  # a GroupBy object; nothing is computed yet
print(df.groupby("key").sum())
print(df.groupby("key").mean())
print(df.groupby("key").var())
for i in df.groupby("key"):
    print(str(i))

# Select a column from the groups
print(df.groupby("key")["data2"].sum())

# Iterate over (group label, sub-frame) pairs
for data, group in df.groupby("key"):
    print("{0:5} shape={1}".format(data, group.shape))

# Call methods on the groups
print(df.groupby("key")["data1"].describe())
print(df.groupby("key").aggregate(["min", "median", "max"]))


# Filtering
def filter_fun(x):
    """Return True when the group's data2 standard deviation exceeds 3.

    Args:
        x: the sub-DataFrame of one group, as passed by GroupBy.filter().
    """
    return x["data2"].std() > 3


print(df.groupby("key")["data2"].std())
print(df.groupby("key").filter(filter_fun))

# Transformation: centre each group on its own mean
print(df.groupby("key").transform(lambda x: x - x.mean()))

# Planets dataset
# NOTE(review): sns.load_dataset presumably downloads the data on first use,
# so this part likely needs network access -- confirm.
planets = sns.load_dataset("planets")
print(planets.shape)
print(planets.head())
print(planets.describe())
decade = 10 * (planets["year"] // 10)
decade = decade.astype(str) + "s"
decade.name = "decade"
print(decade.head())
print(planets.groupby(["method", decade]).sum())
print(planets.groupby(["method", decade])[["number"]].sum().unstack().fillna(0))

# Titanic passenger data analysis
titanic = sns.load_dataset("titanic")
print(titanic.head())
print(titanic.describe())
print(titanic.groupby("sex")[["survived"]].mean())
print(titanic.groupby("sex")["survived"].mean())
print(titanic.groupby(["sex", "class"])[["survived"]].aggregate("mean").unstack())

# Pivot tables
print(titanic.pivot_table("survived", index="sex", columns="class"))
print(titanic.pivot_table("survived", index="sex", columns="class",
                          aggfunc="mean", margins=True))
print(titanic.pivot_table(index="sex", columns="class",
                          aggfunc={"survived": "sum", "fare": "mean"}))

# Further topics not covered here:
#   1. vectorised string operations
#   2. time series handling
#   3. multi-level indexes (for higher-dimensional data)

总结

以上就是今天要讲的内容,本文仅仅简单介绍了pandas的使用和简单应用,而pandas提供了大量能使我们快速便捷地处理数据的函数和方法。
最新回复(0)