# Data analysis (数据分析)
# matplotlib module (matplotlib模块)
# Disabled example, kept as a bare string literal so it never executes:
# draws a line chart of y against x, sets the figure size/dpi and the
# x/y tick positions, then shows the figure (the savefig call is commented out).
'''from matplotlib import pyplot as plt
x=range(0,20,2)
y=[12,1,3,5,6,56,62,13,35,66]
#设置图片大小,dpi设置清晰度
plt.figure(figsize=(20,10),dpi=80)
#绘图
plt.plot(x,y)
#保存
#plt.savefig("./1.jpg")
#设置x轴的刻度
plt.xticks(range(0,20))
plt.yticks(range(min(y),max(y)+1))
#展示图形
plt.show()'''
# Disabled example, kept as a bare string literal so it never executes:
# plots two random integer series (120 points each) as labelled, coloured
# lines with a legend, custom rotated x tick labels, axis labels, a title
# and a grid. It sets the SimHei font so the Chinese labels can be rendered.
'''from matplotlib import pyplot as plt
import random
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False
x=range(120)
y1=[random.randint(20,36) for i in range(120)]
y2=[random.randint(20,36) for i in range(120)]
plt.figure(figsize=(15,8),dpi=80)
#label添加折线图名称,color颜色,要跟上legend才能显示
plt.plot(x,y1,label="jackson",color="pink")
plt.plot(x,y2,label="杜",color="purple")
#图形注释
plt.legend()
#调整x轴的刻度
z=list(x)[::10]
labels=["hello {}".format(i) for i in z]
plt.xticks(z,labels,rotation=45)#rotation旋转度数
plt.xlabel("时间")
plt.ylabel("温度 单位(℃)")
plt.title("不同时间的温度变化")
#绘制网格
plt.grid()
plt.show()'''
# Disabled example, kept as a bare string literal so it never executes:
# scatter-plots two random 31-point series (labelled as March and October),
# placing the second series at x positions 51..81, then adds a legend,
# every-third-day tick labels, axis labels and a title.
'''#绘制温度散点图
from matplotlib import pyplot as plt
import random
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False
y_3=[random.randint(12,20) for i in range(31)]
y_10=[random.randint(20,30) for i in range(31)]
x_3=range(1,32)
x_10=range(51,82)
#使用scatter绘制散点图
plt.scatter(x_3,y_3,label="三月份")
plt.scatter(x_10,y_10,label="十月份")
#添加图例
plt.legend(loc="upper left")
#调整x轴的刻度
_x=list(x_3)+list(x_10)
_xlabels=["三月{}日".format(i) for i in x_3]+["十月{}日".format(i-50) for i in x_10]
plt.xticks(_x[::3],_xlabels[::3],rotation=45)
#添加描述信息
plt.xlabel("时间")
plt.ylabel("温度")
plt.title("每天温度变化")
plt.show()'''
# Disabled example, kept as a bare string literal so it never executes:
# draws a horizontal bar chart (barh) of the values in b, using the
# titles in a as the y-axis tick labels.
'''from matplotlib import pyplot as plt
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False
a=["星际穿越","1917","误杀","急速车王","哪吒之魔童降世","巨齿鲨","大鱼海棠","大话西游之大圣娶亲","风声","美人鱼"]
b=[20,19,15,16,24,25,18,16,11,21]
plt.figure(figsize=(15,8),dpi=80)
#绘制条形图,barh表示横向
plt.barh(range(len(a)),b,height=0.3,color="pink")
plt.yticks(range(len(a)),a)
plt.show()'''
# numpy module (numpy模块)
# Disabled example, kept as a bare string literal so it never executes:
# a numpy walkthrough covering row/column slicing, reversing, reshape,
# concatenate, around, amin/ptp/median/std/var, numpy.matlib
# ones/eye/identity, and np.dot on two nested lists.
'''import numpy as np
import numpy.matlib as matlib
arr=np.array([[88,0,11,0,17,0,46,0,7,0,75,0],
[28,0,33,0,84,0,96,0,88,0,44,0],
[5,0,4,0,71,0,88,0,88,0,50,0],
[54,0,34,0,15,0,77,0,88,0,15,0],
[6,0,85,0,22,0,11,0,12,0,92,0]])
print(arr)
print('\n')
print(arr[1])#取一行
print('\n')
print(arr[:,0:2])#切片出前两列
print('\n')
print(arr[0:2,0:2])#取出前两行的前两列
print('\n')
print(arr[::-1,::-1])#行列反转
print('\n')
print(arr.reshape(60,))#多维数组变成一维数组
print('\n')
print(arr.reshape(12,-1))#-1自动计算行或者列
print('\n')
s1=np.array([[1,2,3],[4,5,6]])
s2=np.array([[1,3,5],[2,4,6]])
M=np.concatenate((s1,s2),axis=0)#1表示横向连接,0表示纵向链接
print(M)
print('\n')
s3=np.array([5.6,6.6,7.6])
print(np.around(s3,0))#对小数进行四舍五入
print('\n')
s4=np.random.randint(50,100,size=(3,5))
print(s4,'\n')
print(np.amin(s4,axis=0))#找出一列中的最小值
print(np.ptp(s4,axis=0))#最小值与最大值之间的差
print(np.median(s4,axis=0))#找出一组数据的中间值
print(np.std(s3),np.var(s3))#求标准差和方差
print('\n')
N=matlib.ones(shape=(2,3))
print(N)
print('\n')
k=matlib.eye(n=5,M=5,k=0)#n表示行数,M表示列数,k表示对角线索引
print(k,'\n')#转置为arr.T
print(matlib.identity(5))
print('\n')
a1=([[1,2],[3,4]])
b1=([[2,2],[5,1]])
print(np.dot(a1,b1))#矩阵相乘'''
# numpy basics demo: array creation, shape manipulation, fancy indexing,
# conditional replacement, NaN assignment and vertical stacking.
# (Statements were previously split across physical lines, which is a
# SyntaxError in Python; each statement is now on one logical line.)
import numpy as np

# 1-D array holding 0..9.
a1 = np.arange(10)
print(a1)

# 2x3 array used for the shape/reshape/flatten/transpose examples.
a2 = np.array([[1, 2, 3], [4, 5, 6]])
print(a2.shape)
print(a2.reshape(3, 2))
print(a2.flatten())
print(a2.transpose())
# Fancy indexing: picks elements (0, 1) and (1, 2), not a sub-matrix.
print(a2[[0, 1], [1, 2]])

# 4x6 array holding 0..23; np.where yields 66 where a3 < 10, else 99.
a3 = np.arange(24).reshape(4, 6)
print(np.where(a3 < 10, 66, 99))

# Cast to float first: NaN cannot be stored in an integer array.
a4 = np.arange(12).reshape((3, 4)).astype("float")
a4[1, 2:] = np.nan
print(a4)

# Stack two 1-D arrays as the rows of a 2-D array.
a5 = np.array([1, 2, 3])
a6 = np.array([4, 5, 6])
res1 = np.vstack((a5, a6))
print(res1)
# pandas module (pandas模块)
# pandas Series demo: construction from a list / dict / labelled list,
# head/tail, unique values, and index-aligned addition.
# (Statements were previously split across physical lines, which is a
# SyntaxError in Python; each statement is now on one logical line.)
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Series from a plain list.
a = Series(data=[1, 2, 3, 4, 5, 6])
print(a)

# Series from a dict: the keys become the index labels.
dic = {
    'a': 1,
    'b': 2,
    'c': 3,
}
b = Series(data=dic)
print(b)

# Series with an explicit label index.
c = Series(data=[1, 2, 3], index=['数学', '语文', '英语'])
print(c)
print(c.head(2))
print(c.tail(2))

# Distinct values of a Series containing duplicates.
d = Series(data=[1, 2, 6, 4, 3, 4, 2, 3, 5, 9, 7, 9])
print(d.unique())

# Addition of two Series that share the same index.
s1 = Series(data=[1, 2, 3])
s2 = Series(data=[4, 5, 6])
print(s1 + s2)

# Addition of two Series with different indexes: labels present in only
# one operand produce null entries, which the notnull() mask filters out.
s3 = Series(data=[1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f'])
s4 = Series(data=(2, 3, 6), index=['c', 'e', 'f'])
s = s3 + s4
e = s[s.notnull()]
print(e)
# pandas DataFrame demo: construction from a random integer array and from
# a dict of columns, then dropping a column.
# (Statements were previously split across physical lines, which is a
# SyntaxError in Python; each statement is now on one logical line.)

# 5x6 frame of random integers drawn from [0, 100).
df = DataFrame(data=np.random.randint(0, 100, size=(5, 6)))
print(df)

# Dict keys become column names; the explicit index labels the three rows.
dic = {
    'name': ['jackosn', 'jay', 'jaky'],
    'salary': [20000, 10000, 15000],
}
df2 = DataFrame(data=dic, index=[1, 2, 3])
print(df2)

# axis=1 drops a column; drop() returns a new frame, so df2 is unchanged.
f = df2.drop(labels='salary', axis=1)
print(f)
# Imports for the pandas examples that follow. Each import statement was
# previously split across two or three lines, which is a SyntaxError.
import numpy as np
import pandas as pd
from pandas import DataFrame
# Disabled example, kept as a bare string literal so it never executes:
# demonstrates DataFrame.replace (scalar and dict forms), Series.map with a
# dict and with a function, and apply.
# NOTE(review): inside the string, the bodies of after_sal and fun are not
# indented under their def lines, so this text needs re-indenting before it
# could be enabled as code.
'''df=DataFrame(np.random.randint(0,10,size=(5,6)))
#下面两种替换方式
df.replace(to_replace=0,value='zero')
print(df.replace(to_replace={0:'abc',5:13}))
#将指定列中的替换
df.iloc[2]=[0,0,0,0,0,0]
print(df.replace(to_replace={3:0},value=666))
dic={'name':['张三','李四','王五'],
'salary':[20000,15000,10000]}
s=DataFrame(data=dic)
print(s)
dic={
'张三':'tom',
'李四':'jay',
'王五':'jerry'
}
#映射关系表
print(s['name'].map(dic))
#计算税后收入,超过3000收50%的税
def after_sal(s):
return s-(s-3000)*0.5
s['after_sal']=s['salary'].map(after_sal)
print(s)
#apply操作:
#可以将行或列数据进行运算
def fun(s):
s=s.sum()
print(s)
s.apply(fun)'''
# Disabled example, kept as a bare string literal so it never executes:
# builds a two-column frame and prints it with both the row index and the
# column labels renamed through mapping dicts (DataFrame.rename).
'''df=DataFrame({
'color':['red','purple','blue','green','pink'],
'value':np.random.randint(10,size=5)
})
print(df)
#映射df中的数据
new_index={0:'zero',1:'one',2:'two',3:'three',4:'four'}
#映射列索引
new_col={'color':'cc','value':'vv'}
print(df.rename(index=new_index,columns=new_col))'''
# Disabled example, kept as a bare string literal so it never executes:
# shuffles the columns and then the rows of a 20x3 frame by passing random
# permutations to DataFrame.take.
'''#排序实现的随机抽样
df=DataFrame(np.random.randint(100,size=(20,3)),columns=['A','B','C'])
print(df.take(np.random.permutation(3),axis=1).take(np.random.permutation(20),axis=0))
'''
# Disabled example, kept as a bare string literal so it never executes:
# groups rows by 'item', takes the mean 'price' per group, converts it to a
# dict and maps it back onto the frame as a new 'average' column.
'''df=DataFrame({'item':['a','b','c','b','c','a'],
'price':[1,6,8,5,11,3],
'color':['pink','purple','blue','blue','pink','yellow'],
'weight':[16,12,18,20,12,30]})
average=df.groupby(by='item').mean()['price']
dic=average.to_dict()
df['average']=df['item'].map(dic)
print(df)'''
# Load a delimited text file from a fixed local path and print the result.
# (The statements were previously split across physical lines, which is a
# SyntaxError in Python.)
# NOTE(review): the path is machine-specific; read_csv will fail if the
# file does not exist on the machine running this script.
data = pd.read_csv('C:/Users/Administrator/Desktop/1.txt')
print(data)
# pandas demo: date-indexed frame, frame built from mixed column types,
# and splitting/re-joining a frame with pd.concat.
# (Statements were previously split across physical lines, which is a
# SyntaxError in Python; each statement is now on one logical line.)
import pandas as pd
import numpy as np

# Six consecutive daily timestamps starting 2013-01-01, used as the row index.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

# Each dict value becomes one column; the scalars ('A', 'B', 'F') are
# broadcast to the length of the array-like columns (4 rows).
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20200406'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2)

# Cut a 10-row frame into three row slices, then stitch them back together.
df3 = pd.DataFrame(np.random.randn(10, 4))
print(df3)
pieces = [df3[:3], df3[3:7], df3[7:]]
print(pieces)
print(pd.concat(pieces))
# pandas merge demo: join two frames on a shared 'key' column.
# (Statements were previously split across physical lines, which is a
# SyntaxError in Python; each statement is now on one logical line.)
import pandas as pd

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
# Every left row matches every right row with the same key, so two rows on
# each side with key 'foo' produce 2 x 2 = 4 merged rows.
M = pd.merge(left, right, on='key')
print(left)
print(right)
print(M)
# pandas demo: append a copy of one row to the end of a frame.
# (Statements were previously split across physical lines, which is a
# SyntaxError in Python; each statement is now on one logical line.)
import pandas as pd
import numpy as np

# 8x4 frame of random normal samples with named columns.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
# Row 3 as a Series whose index is the column labels.
s = df.iloc[3]
# DataFrame.append was removed in pandas 2.0, and the original call also
# discarded its return value. pd.concat is the supported replacement: the
# Series is turned into a one-row frame first so that it is added as a row,
# and ignore_index=True renumbers the resulting rows 0..8.
appended = pd.concat([df, s.to_frame().T], ignore_index=True)
print(appended)
import pandas as pd
import numpy as np
# Outer and inner label sequences; pairing them position by position gives
# one (first, second) tuple per row of the frame.
first_labels = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
second_labels = ['one', 'two'] * 4
tuples = list(zip(first_labels, second_labels))

# A MultiIndex is a multi-level index; each entry is a tuple of labels,
# and the levels are named 'first' and 'second' here.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

# 8 rows (one per index tuple) by 2 columns of random normal samples.
values = np.random.randn(8, 2)
df = pd.DataFrame(values, index=index, columns=['A', 'B'])

# stack() moves the column labels into a new innermost index level,
# turning the 8x2 frame into a 16-element Series.
stacked = df.stack()
print(stacked)