文章目录
一、pandas是什么? 二、使用步骤 1.引入库 2.基本操作 3.DataFrame性质 4.应用-数值运算及统计分析 5.利用pandas求相关性系数和协方差 6.缺失值处理 7.应用举例
总结
一、pandas是什么?
pandas 是基于 NumPy 的一种工具,该工具是为了解决数据分析任务而创建的。 Series:一维数组,与 NumPy 中的一维 array 类似。二者与 Python 基本的数据结构 List 也很相近,其区别是:List 中的元素可以是不同的数据类型,而 Array 和 Series 中则只允许存储相同的数据类型,这样可以更有效地使用内存,提高运算效率。 Time-Series:以时间为索引的 Series。 DataFrame:二维的表格型数据结构。很多功能与 R 中的 data.frame 类似。可以将 DataFrame 理解为 Series 的容器。以下的内容主要以 DataFrame 为主。 Panel:三维的数组,可以理解为 DataFrame 的容器(注意:Panel 已在 pandas 0.25 中被移除,三维数据请改用带 MultiIndex 的 DataFrame 或 xarray)。
二、使用步骤
1.引入库
代码如下(示例):
import ssl
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence library warnings so the tutorial output stays readable.
warnings.filterwarnings('ignore')

# SECURITY NOTE: this turns off HTTPS certificate verification for every
# default-context HTTPS request made by this process (presumably so that the
# sample-data downloads used later succeed). Do not copy into production code.
ssl._create_default_https_context = ssl._create_unverified_context
2.基本操作
代码如下(示例):
import numpy as np
import pandas as pd
from pandas import Series

# Build a Series from a plain list.
data = pd.Series([1, 5, 3, 4, 5, 6])
print(data)

# Build a Series with explicit string labels.
data = pd.Series([1, 5, 3, 4], index=["a", "b", "c", "d"])
print(data)

# Build from a one-dimensional NumPy array.
x = np.arange(5)
print(pd.Series(x))

# Build from a dict: keys become the index, values become the data.
population_dict = {
    "BeiJing": 2154,
    "ShangHai": 2424,
    "ShengZheng": 1303,
    "HangZhou": 981,
}
population = pd.Series(population_dict)
print(population)

# Labels requested in `index` but absent from the dict are filled with NaN.
population = pd.Series(population_dict, index=["BeiJing", "ShangHai", "c"])
print(population)

# A scalar is repeated for every index label (all values are 5).
print(pd.Series(5, index=[100, 200, 300]))
Pandas DataFrame对象 带标签数据的多维数组:
# General form: pd.DataFrame(data, index=index, columns=columns)
import numpy as np
import pandas as pd

# 1. Create from a single Series.
population_dict = {
    "BeiJing": 2154,
    "ShangHai": 2424,
    "ShengZheng": 1303,
    "HangZhou": 981,
}
population = pd.Series(population_dict)
print(pd.DataFrame({"population": population}))

# 2. Create from a dict of Series (a scalar value fills the whole column).
GDP_dict = {
    "BeiJing": 30320,
    "ShangHai": 32680,
    "ShengZheng": 112303,
    "HangZhou": 12981,
}
GDP = pd.Series(GDP_dict)
print(pd.DataFrame({"population": population, "GDP": GDP, "country": "China"}))

# 3. Create from a list of dicts.
data = [{"a": i, "b": 2 * i} for i in range(3)]
print(data)
print(pd.DataFrame(data))

# Keys missing from a record are filled with NaN.
data = [{"a": 1, "b": 1}, {"b": 3, "c": 1}]
print(pd.DataFrame(data))

# 4. Create from a two-dimensional NumPy array.
data = np.random.randint(10, size=(3, 2))
print(pd.DataFrame(data, columns=["foo", "bar"], index=["a", "b", "c"]))
3.DataFrame性质
# DataFrame properties
import numpy as np
import pandas as pd

# 1. Attributes
population_dict = {
    "BeiJing": 2154,
    "ShangHai": 2424,
    "ShengZheng": 1303,
    "HangZhou": 981,
}
population = pd.Series(population_dict)
GDP_dict = {
    "BeiJing": 30320,
    "ShangHai": 32680,
    "ShengZheng": 112303,
    "HangZhou": 12981,
}
GDP = pd.Series(GDP_dict)
data = pd.DataFrame({"population": population, "GDP": GDP, "country": "China"})
print(data)
print(data.values)   # 1. the data as a NumPy array
print(data.index)    # 2. row index
print(data.columns)  # 3. column index
print(data.shape)    # 4. shape: 4 rows, 3 columns
print(data.size)     # 5. number of elements: 12
print(data.dtypes)   # 6. dtype of every column

# 2. Indexing
# Selecting columns
print(data["population"])  # the population column
print(data["GDP"])         # dict-style access to the GDP column
print(data.GDP)            # attribute-style access

# Selecting rows
print(data.loc["BeiJing"])  # label-based ("absolute") indexing
# NOTE: iloc is position-based ("relative") indexing, but with two integers
# it yields the single cell at row 0, column 2 rather than a whole row.
print(data.iloc[0, 2])

# Selecting scalars
print(data.loc["BeiJing", "GDP"])
print(data.iloc[0, 1])
print(data.values[0][1])

# Indexing a Series
print(data.GDP)
print(GDP)
print(GDP["BeiJing"])
# 3. Slicing
datas = pd.date_range(start='2020-01-01', periods=6)
print(datas)
df = pd.DataFrame(np.random.randn(6, 4), index=datas, columns=["a", "b", "c", "d"])
print(df)

# Row slices
print(df["2020-01-01":"2020-01-03"])
print(df.loc["2020-01-01":"2020-01-03"])
print(df.iloc[0:3])

# Column slices
print(df.loc[:, "a":"c"])
print(df.iloc[:, 0:3])

# Mixed selections
# Slice rows and columns at the same time
print(df.loc["2020-01-02":"2020-01-03", "c":"d"])
print(df.iloc[1:3, 2:])

# Slice rows, pick individual columns
print(df.loc["2020-01-02":"2020-01-03", ["c", "d"]])
print(df.iloc[3:, [0, 2]])

# Pick individual rows and individual columns
print(df.loc[["2020-01-02", "2020-01-03"], ["c", "d"]])

# 4. Boolean indexing
print(df > 0)

# isin() method
df2 = df.copy()
df2['e'] = ['one', 'two', 'three', 'four', 'five', 'six']  # add a column
print(df2)
ind = df2['e'].isin(["two", "four"])  # membership mask over column e
print(ind)

# 5. Assignment
# Add a new column
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20200101', periods=6))
print(s1)
df["e"] = s1
print(df)

# Overwrite existing values
df.loc['2020-01-01', 'a'] = 0
df.iloc[0, 1] = 0
df["d"] = np.array([5] * len(df))
print(df)

# Replace the index and the column labels with plain integers
df.index = list(range(len(df)))
print(df)
df.columns = list(range(df.shape[1]))
print(df)
4.应用-数值运算及统计分析
# Counter is used below; import it here so the snippet runs on its own.
from collections import Counter

# 1. Inspecting data
dates = pd.date_range(start='2019-01-01', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=["A", "B", "C", "D"])
print(df)
print(df.head())   # first rows, 5 by default
print(df.head(2))
print(df.tail())   # last 5 rows
df.iloc[0, 3] = np.nan
print(df)
df.info()  # overall summary; info() prints by itself and returns None

# 2. NumPy universal functions also work on pandas objects
x = pd.DataFrame(np.arange(4).reshape(1, 4))
print(x)
y = pd.DataFrame(np.arange(4, 8).reshape(1, 4))
print(y)
print(x * y)

# Matrix-style operations
np.random.seed(42)
x = pd.DataFrame(np.random.randint(10, size=(30, 30)))
print(x)
print(x.T)
y = pd.DataFrame(np.random.randint(10, size=(30, 30)))
print(x.dot(y))
# pandas is slower than NumPy here: pandas targets data handling,
# NumPy targets raw computation.

# Broadcasting: divide every row by the first row
print(x / x.iloc[0])

# pandas-specific behaviour
# Index alignment
A = pd.DataFrame(np.random.randint(0, 20, size=(2, 2)), columns=list("AB"))
B = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=list("ABC"))
print(A)
print(B)
print(A + B)                   # aligned automatically; missing cells become np.nan
print(A.add(B, fill_value=0))  # treat a missing operand as 0

# 3. Statistics
# Counting distinct values
y = np.random.randint(3, size=20)
print(y)
print(np.unique(y))
print(Counter(y))
y1 = pd.DataFrame(y, columns=["A"])
print(y1)
print(np.unique(y1))
print(y1["A"].value_counts())
import pandas as pd
import numpy as np
import timeit
from collections import Counter

# Derive a new column and sort by it
population_dict = {
    "BeiJing": 2154,
    "ShangHai": 2424,
    "ShengZheng": 1303,
    "HangZhou": 981,
}
population = pd.Series(population_dict)
GDP_dict = {
    "BeiJing": 30320,
    "ShangHai": 32680,
    "ShengZheng": 112303,
    "HangZhou": 12981,
}
GDP = pd.Series(GDP_dict)
city_info = pd.DataFrame({"population": population, "GDP": GDP, "country": "China"})
city_info["per_GDP"] = city_info["GDP"] / city_info["population"]
print(city_info)

# Ascending sort
print(city_info.sort_values(by="per_GDP"))
# Descending sort
print(city_info.sort_values(by="per_GDP", ascending=False))

# Sorting along an axis
data = pd.DataFrame(
    np.random.randint(20, size=(3, 4)),
    index=[2, 1, 0],
    columns=["D", "B", "A", "C"],
)
print(data)
print(data.sort_index())        # sort by row labels
print(data.sort_index(axis=1))  # sort by column labels

# Statistical methods
df = pd.DataFrame(np.random.normal(2, 4, size=(6, 4)), columns=list("ABCD"))
print(df)
print(df.count())      # number of non-null entries per column
print(df.sum())
print(df.sum(axis=1))
# max, min, var, std and median follow the same pattern.
# mode: the most frequent value(s); note this is computed on `data`, not `df`.
print(data.mode())
# One call for count, mean, std, min, quartiles and max
print(df.describe())
5.利用pandas求相关性系数和协方差
# Pairwise correlation between columns, then each column against column A.
# (df.cov() would give the covariance matrix named in the section title.)
print(df.corr())
print(df.corrwith(df["A"]))

# Custom output via apply()
print(df.apply(np.cumsum))          # cumulative sum down each column
print(df.apply(np.cumsum, axis=1))  # cumulative sum along each row
print(df.sum())                     # total of every column
print(df.apply(lambda x: x.max() - x.min()))  # range (max - min) per column
def my_describe(x):
    """Summarise one column with a hand-picked set of statistics.

    Args:
        x: Object providing ``count``, ``mean``, ``max``, ``idxmin`` and
            ``std`` methods (for example a column handed over by
            ``DataFrame.apply``).

    Returns:
        A ``pd.Series`` labelled "Count", "mean", "max", "idxmin" and "std"
        holding the corresponding results, in that order.
    """
    return pd.Series(
        [x.count(), x.mean(), x.max(), x.idxmin(), x.std()],
        index=["Count", "mean", "max", "idxmin", "std"],
    )
# Run the custom summary on every column of df.
print(df.apply(my_describe))
6.缺失值处理
import pandas as pd
import numpy as np
import timeit
from collections import Counter

# 1. Detecting missing values
data = pd.DataFrame(
    np.array([[1, np.nan, 2],
              [np.nan, 3, 4],
              [5, 6, 7]]),
    columns=["A", "B", "C"],
)
# Mixing in NaN, strings and similar values can turn a column into object
# dtype, which costs more than int; np.nan itself is a special float value.
print(data)
print(data.dtypes)
print(data.isnull())
print(data.notnull())

# 2. Dropping missing values
print(data.dropna())        # drop every row containing a missing value
print(data.dropna(axis=1))  # drop every column containing a missing value
data["C"] = np.nan
print(data)
print(data.dropna(axis="columns", how="all"))  # drop columns that are entirely NaN
data.loc[3] = np.nan
print(data)
print(data.dropna(how="all"))  # drop rows that are entirely NaN

# 3. Filling missing values
print(data.fillna(value=5))
# Replace with the mean of all remaining values
fill = data.stack().mean()
print(fill)
print(data.fillna(value=fill))
# Combining data
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells name their own position.

    Args:
        cols: Iterable of column labels (a string yields one column per
            character).
        ind: Iterable of row labels, also used as the index.

    Returns:
        A ``pd.DataFrame`` where the cell at column ``c`` and row ``i``
        holds ``str(c) + str(i)``, e.g. ``"A1"``.
    """
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind)
print(make_df("ABC", range(3)))

# Vertical (row-wise) concatenation
df_1 = make_df("AB", [1, 2])
df_2 = make_df("AB", [1, 4])
print(df_1)
print(df_2)
# Row label 1 appears in both frames; ignore_index=True renumbers the result.
print(pd.concat([df_1, df_2], ignore_index=True))

# Horizontal (column-wise) concatenation
df_1 = make_df("CD", [1, 2])
df_2 = make_df("AB", [1, 2])
print(df_1)
print(df_2)
print(pd.concat([df_2, df_1], axis=1))

# Aligned combination with merge()
df_9 = make_df("AB", [1, 2])
df_10 = make_df("BC", [1, 2])
print(df_9)
print(df_10)
print(pd.merge(df_9, df_10))
7.应用举例
# Merging city information
population_dict = {
    "city": ("BeiJing", "HangZhou", "ShenZhen"),
    "pop": (2154, 981, 1303),
}
population = pd.DataFrame(population_dict)
print(population)

GDP_dict = {
    "city": ("BeiJing", "ShangHai", "ShenZhen"),
    "GDP": (30320, 32680, 13468),
}
GDP = pd.DataFrame(GDP_dict)
print(GDP)

# Default merge keeps only cities present in both frames (intersection).
city_info = pd.merge(population, GDP)
print(city_info)

# how="outer" keeps every city from either frame (union).
city_info = pd.merge(population, GDP, how="outer")
print(city_info)
# Grouping and pivot tables
df = pd.DataFrame({
    "key": ["A", "B", "C", "C", "B", "A"],
    "data1": range(6),
    "data2": np.random.randint(0, 10, size=6),
})
print(df)

# Grouping
print(df.groupby("key"))  # a groupby object; nothing is aggregated yet
print(df.groupby("key").sum())
print(df.groupby("key").mean())
print(df.groupby("key").var())
for i in df.groupby("key"):
    print(str(i))

# Select one column of the groups
print(df.groupby("key")["data2"].sum())

# Iterate group by group
for key, group in df.groupby("key"):
    print("{0:5} shape={1}".format(key, group.shape))

# Call methods on the groups
print(df.groupby("key")["data1"].describe())
print(df.groupby("key").aggregate(["min", "median", "max"]))
# Filtering
def filter_fun(x):
    """Tell whether the "data2" values of ``x`` vary strongly.

    Args:
        x: Object indexable by ``"data2"`` whose result has a ``std`` method
            (for example one group passed in by ``groupby(...).filter``).

    Returns:
        The result of ``x["data2"].std() > 3``.
    """
    return x["data2"].std() > 3
# Per-group standard deviation of data2, shown for comparison with the
# filtered result below.
print(df.groupby("key")["data2"].std())
print(df.groupby("key").filter(filter_fun))

# Transformation: subtract each group's mean from its members
print(df.groupby("key").transform(lambda x: x - x.mean()))
# Planets dataset
import seaborn as sns

planets = sns.load_dataset("planets")
print(planets.shape)
print(planets.head())
print(planets.describe())

# Floor every year to the start of its decade (e.g. 2009 -> 2000), then
# label it as a string such as "2000s".
decade = 10 * (planets["year"] // 10)
decade = decade.astype(str) + "s"
decade.name = "decade"
print(decade.head())
print(planets.groupby(["method", decade]).sum())
print(planets.groupby(["method", decade])[["number"]].sum().unstack().fillna(0))
# Titanic passenger data analysis
titanic = sns.load_dataset("titanic")
print(titanic.head())
print(titanic.describe())

# Mean of the survived column by sex: as a DataFrame, then as a Series
print(titanic.groupby("sex")[["survived"]].mean())
print(titanic.groupby("sex")["survived"].mean())
print(titanic.groupby(["sex", "class"])[["survived"]].aggregate("mean").unstack())

# Pivot tables
print(titanic.pivot_table("survived", index="sex", columns="class"))
print(titanic.pivot_table("survived", index="sex", columns="class",
                          aggfunc="mean", margins=True))
print(titanic.pivot_table(index="sex", columns="class",
                          aggfunc={"survived": "sum", "fare": "mean"}))
#其它
1向量化字符串操作
2处理时间序列
3多级索引:用于多维数组
总结
以上就是今天要讲的内容,本文仅仅简单介绍了pandas的使用和简单应用,而pandas提供了大量能使我们快速便捷地处理数据的函数和方法。