《利用Python进行数据分析》第10章 时间序列

it2023-03-03  76

第10章 时间序列

10.1 日期和时间数据类型及工具

# Ch.10 time series -- setup and datetime basics.
# Cleaned up from the original cell, which imported datetime twice, used
# `from pandas import *` (star import), and embedded the conflicting IPython
# magics %matplotlib inline / %matplotlib notebook directly in Python source
# (magics are not valid Python and only one backend mode can be active).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

now = datetime.now()
now.year, now.month, now.day        # e.g. (2020, 10, 20)

# Subtracting two datetimes yields a timedelta.
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta.days      # 926
delta.seconds   # 56700

# Adding a timedelta shifts a datetime.
start = datetime(2011, 1, 7)
start + timedelta(12)   # datetime.datetime(2011, 1, 19, 0, 0)

字符串和datetime的相互转换

# Converting between strings and datetimes.
stamp = datetime(2011, 1, 3)
str(stamp)                      # '2011-01-03 00:00:00'
stamp.strftime('%Y-%m-%d')      # '2011-01-03'

# Parsing a known format with strptime.
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datestrs = ['7/6/2011', '8/6/2011']
[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]

# dateutil can guess almost any human-readable format.
from dateutil.parser import parse
parse('2011-01-03')
parse('Jan 31, 1997 10:45 PM')
parse('6/12/2011', dayfirst=True)   # day appears before month

# pandas parses whole collections at once; None becomes NaT.
pd.to_datetime(datestrs)
idx = pd.to_datetime(datestrs + [None])
pd.isnull(idx)   # array([False, False,  True])

10.2 时间序列基础

from datetime import datetime
from pandas import Series

# A Series indexed by datetimes becomes a time series.
dates = [
    datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7),
    datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12),
]
ts = Series(np.random.randn(6), index=dates)

type(ts)        # pandas.core.series.Series
ts.index        # DatetimeIndex, dtype='datetime64[ns]'

# Arithmetic aligns automatically on the dates; labels present in only
# one operand come out as NaN.
ts + ts[::2]

ts.index.dtype          # dtype('<M8[ns]')
stamp = ts.index[0]     # Timestamp('2011-01-02 00:00:00')

索引、选取、子集构造

# Indexing, selection and subsetting on time series.
stamp = ts.index[2]
ts[stamp]            # index by Timestamp
ts['1/10/2011']      # ...or by any parseable date string
ts['20110110']

# For long series, a year or year-month string selects a whole span.
longer_ts = Series(np.random.randn(1000),
                   index=pd.date_range('1/1/2000', periods=1000))
longer_ts['2001']      # every row in 2001
longer_ts['2001-05']   # every row in May 2001

# Date slicing works even when the endpoints are absent from the index
# (only meaningful for a sorted/regular Series).
ts[datetime(2011, 1, 7):]
ts['1/6/2011':'1/11/2011']
ts.truncate(after='1/9/2011')   # drop everything after the cutoff

# Same machinery on a DataFrame with a weekly (Wednesday) index.
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
long_df = DataFrame(np.random.randn(100, 4), index=dates,
                    columns=['Colorado', 'Texas', 'New York', 'Ohio'])
long_df.loc['5-2000']

.

带有重复索引的时间序列

# A time series may carry duplicate index entries.
dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
                          '1/2/2000', '1/3/2000'])
dup_ts = Series(np.arange(5), index=dates)

dup_ts.index.is_unique   # False
dup_ts['1/3/2000']       # unique label -> scalar: 4
dup_ts['1/2/2000']       # duplicated label -> Series of the three rows

10.3 日期的范围、频率以及移动

# Convert the irregular series to fixed daily frequency.
# resample('D') alone returns a Resampler object; chaining .asfreq()
# materializes it, inserting NaN for the dates that were missing.
ts
ts.resample('D').asfreq()

生成日期范围

# pd.date_range generates a DatetimeIndex (daily frequency by default).
index = pd.date_range('4/1/2012', '6/1/2012')
index

pd.date_range(start='4/1/2012', periods=20)   # 20 days forward from start
pd.date_range(end='6/1/2012', periods=20)     # 20 days back from end

# 'BM' = last business day of each month.
pd.date_range('1/1/2000', '12/1/2000', freq='BM')

# A time-of-day in the start string is preserved...
pd.date_range('5/2/2012 12:56:31', periods=5)
# ...unless normalize=True snaps every stamp to midnight.
pd.date_range('5/2/2012 12:56:31', periods=5, normalize=True)

频率和日期偏移量

from pandas.tseries.offsets import Hour, Minute

# Frequencies are built from date-offset objects.
hour = Hour()
hour                 # <Hour>
four_hours = Hour(4)
four_hours           # <4 * Hours>

pd.date_range('1/1/2000', '1/3/2000 23:59', freq='4h')

# Offsets can be combined with +.
Hour(2) + Minute(30)   # <150 * Minutes>

# Frequency strings accept the same combinations.
pd.date_range('1/1/2000', periods=10, freq='1h30min')

WOM日期

# Week-of-month dates: 'WOM-3FRI' = third Friday of every month.
rng = pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI')
list(rng)   # 2012-01-20, 2012-02-17, ..., 2012-08-17

# Shifting (leading / lagging) data.
ts = Series(np.random.randn(4),
            index=pd.date_range('1/1/2000', periods=4, freq='M'))

ts.shift(2)    # data moves forward; first two slots become NaN
ts.shift(-2)   # data moves backward; last two slots become NaN

# Classic use: percent change.
ts / ts.shift(1) - 1

# With a freq argument the timestamps move instead of the data.
ts.shift(2, freq='M')
ts.shift(3, freq='D')
ts.shift(1, freq='3D')
ts.shift(1, freq='90T')   # 90 minutes

通过偏移量对日期进行位移

from pandas.tseries.offsets import Day, MonthEnd

# Shifting dates with offsets.
now = datetime(2011, 11, 17)
now + 3 * Day()     # Timestamp('2011-11-20')
now + MonthEnd()    # roll to end of this month: 2011-11-30
now + MonthEnd(2)   # end of next month: 2011-12-31

# Anchored offsets can roll a date explicitly in either direction.
offset = MonthEnd()
offset.rollforward(now)   # 2011-11-30
offset.rollback(now)      # 2011-10-31

# rollforward works as a groupby key...
ts = Series(np.random.randn(20),
            index=pd.date_range('1/15/2000', periods=20, freq='4d'))
ts.groupby(offset.rollforward).mean()
# ...which resample expresses more simply.
ts.resample('M').mean()

10.4 时区处理

import pytz

# Time-zone names come from the pytz (Olson) database.
pytz.common_timezones[-5:]   # [..., 'US/Pacific', 'UTC']

# Get a tzinfo object by name.
tz = pytz.timezone('US/Eastern')
tz

本地化和转换

# Localization and conversion.
rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)
print(ts.index.tz)   # None -- the index is time-zone naive

# date_range can build an aware index directly with tz=.
pd.date_range('3/9/2012 09:30', periods=10, freq='D', tz='UTC')

# naive -> aware: tz_localize
ts_utc = ts.tz_localize('UTC')
ts_utc
# aware -> another zone: tz_convert (DST transitions handled for you)
ts_utc.tz_convert('US/Eastern')

# Or localize to US/Eastern first, then convert to UTC / Berlin.
ts_eastern = ts.tz_localize('US/Eastern')
ts_eastern.tz_convert('UTC')
ts_eastern.tz_convert('Europe/Berlin')

# tz_localize and tz_convert are also DatetimeIndex instance methods.
ts.index.tz_localize('Asia/Shanghai')

操作时区意识型(time zone-aware)的Timestamp对象

# Time-zone-aware Timestamp objects.
stamp = pd.Timestamp('2011-03-12 04:00')
stamp_utc = stamp.tz_localize('utc')
stamp_utc                               # 04:00 UTC
stamp_utc.tz_convert('US/Eastern')      # 23:00 the previous day, -0500

# A zone can be given at construction time.
stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')
stamp_moscow

# .value (nanoseconds since the UTC epoch) never changes under conversion.
stamp_utc.value
stamp_utc.tz_convert('US/Eastern').value

from pandas.tseries.offsets import Hour
# Offset arithmetic respects DST transitions.
stamp = pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')
stamp + Hour()

不同时区之间的运算

# Combining series from different time zones: the result is stored in UTC.
rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')
ts = Series(np.random.randn(len(rng)), index=rng)
ts

ts1 = ts[:7].tz_localize('Europe/London')
ts2 = ts1[2:].tz_convert('Europe/Moscow')
result = ts1 + ts2
result.index   # dtype='datetime64[ns, UTC]'

10.5 时期(Period)及其算术运算

# Periods represent whole time spans rather than instants.
p = pd.Period(2007, freq='A-DEC')   # the year 2007
p + 5    # Period('2012', 'A-DEC')
p - 2    # Period('2005', 'A-DEC')

# Differencing two periods of the same frequency gives an offset count.
pd.Period('2014', freq='A-DEC') - p   # <7 * YearEnds: month=12>

# Regular ranges of periods.
rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')
rng
Series(np.random.randn(6), index=rng)

# A PeriodIndex can be built from strings too.
values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC')
index

时期的频率转换

# asfreq converts periods (and period-indexed series) between frequencies.
p = pd.Period('2007', freq='A-DEC')
p.asfreq('M', how='start')   # Period('2007-01', 'M')
p.asfreq('M', how='end')     # Period('2007-12', 'M')

# Fiscal year ending in June: spans Jul 2006 .. Jun 2007.
p = pd.Period('2007', freq='A-JUN')
p.asfreq('M', 'start')   # Period('2006-07', 'M')
p.asfreq('M', 'end')     # Period('2007-06', 'M')

# High -> low frequency: the super-period is decided by which span the
# sub-period falls into (Aug 2007 is inside fiscal 2008 for A-JUN).
p = pd.Period('2007-08', 'M')
p.asfreq('A-JUN')   # Period('2008', 'A-JUN')

# Same conversions apply to a whole PeriodIndex.
rng = pd.period_range('2006', '2009', freq='A-DEC')
ts = Series(np.random.randn(len(rng)), index=rng)
ts
ts.asfreq('M', how='start')   # 2006-01, 2007-01, ...
ts.asfreq('B', how='end')     # last business day of each year

按季度计算的时期频率

# Quarterly periods with a fiscal year ending in January.
p = pd.Period('2012Q4', freq='Q-JAN')
p
p.asfreq('D', 'start')   # Period('2011-11-01', 'D')
p.asfreq('D', 'end')     # Period('2012-01-31', 'D')

# 4 p.m. on the second-to-last business day of the quarter:
# last business day, minus one, as minutes, plus 16 hours.
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
p4pm
p4pm.to_timestamp()   # Timestamp('2012-01-30 16:00:00')

# Vectorized over a whole PeriodIndex.
rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')
ts = Series(np.arange(len(rng)), index=rng)
ts
new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
ts.index = new_rng.to_timestamp()
ts

将Timestamp转换为Period

# Timestamp -> Period conversion and back.
rng = pd.date_range('1/1/2000', periods=3, freq='M')
ts = Series(np.random.randn(3), index=rng)
pts = ts.to_period()   # monthly PeriodIndex inferred from the freq
ts
pts

# Daily data mapped to months produces (allowed) duplicate periods.
rng = pd.date_range('1/29/2000', periods=6, freq='D')
ts2 = Series(np.random.randn(6), index=rng)
ts2.to_period('M')

# Back to timestamps; how='end' gives the last instant of each period.
pts.to_timestamp(how='end')

通过数组创建PeriodIndex

# Build a PeriodIndex from the separate year/quarter columns of macrodata.
# NOTE: the original path used backslashes inside a normal string literal
# ('data\macrodata\...'), which only works by luck (\m is not an escape) and
# breaks off Windows; forward slashes are portable.
data = pd.read_csv('data/macrodata/macrodata.csv')
data.info()
data.year      # float64 calendar year, e.g. 1959.0
data.quarter   # float64 quarter number 1-4

# Combine the two columns into a quarterly PeriodIndex and attach it.
index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')
index   # PeriodIndex(['1959Q1', ..., '2009Q3'], freq='Q-DEC')
data.index = index
data.infl   # now indexed by quarter

10.6 重采样及频率转换

# resample is the workhorse for all frequency-conversion jobs.
rng = pd.date_range('1/1/2000', periods=100, freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)

ts.resample('M').mean()                  # monthly means, DatetimeIndex labels
ts.resample('M', kind='period').mean()   # same means, PeriodIndex labels

降采样

# Downsampling one-minute data into five-minute bins.
rng = pd.date_range('1/1/2000', periods=12, freq='T')
ts = Series(np.arange(12), index=rng)
ts

# For minute data the bins are closed/labelled on the left by default:
# [00:00, 00:05) -> 10, [00:05, 00:10) -> 35, [00:10, ...) -> 21.
ts.resample('5min').sum()
ts.resample('5min', closed='left').sum()                 # explicit default
ts.resample('5min', closed='left', label='left').sum()   # explicit default

# The original used resample(..., loffset='-1s') to nudge the result labels.
# The loffset keyword was removed in pandas 2.0; shift the index instead.
shifted = ts.resample('5min').sum()
shifted.index = shifted.index + pd.Timedelta('-1s')
shifted   # 1999-12-31 23:59:59 -> 10, etc.

OHLC重采样

# Open/high/low/close aggregation per bin.
# Old API was resample('5min', how='ohlc'); it is now the .ohlc() method.
ts.resample('5min').ohlc()

通过groupby进行重采样

# Resampling expressed as a groupby over index attributes.
rng = pd.date_range('1/1/2000', periods=100, freq='D')
ts = Series(np.arange(100), index=rng)

ts.groupby(lambda d: d.month).mean()     # average per calendar month
ts.groupby(lambda d: d.weekday).mean()   # average per day of the week

升采样和插值

# Upsampling and interpolation: start from a weekly (Wednesday) frame.
frame = DataFrame(np.random.randn(2, 4),
                  index=pd.date_range('1/1/2000', periods=2, freq='W-WED'),
                  columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame

.

# Upsample to daily; .asfreq() materializes the Resampler, leaving NaN
# in the newly introduced rows.
df_daily = frame.resample('D')
df_daily.asfreq()

.

# Fill the gaps forward (old API: resample('D', fill_method='ffill')).
frame.resample('D').ffill()

.

# Forward-fill, but propagate each value at most 2 rows
# (old API: fill_method='ffill', limit=2).
frame.resample('D').ffill(limit=2)

.

# Resample onto a Thursday-anchored weekly frequency with forward fill
# (old API: fill_method='ffill').
frame.resample('W-THU').ffill()

.

通过时期进行重采样

# Resampling with periods: two years of monthly data.
frame = DataFrame(np.random.randn(24, 4),
                  index=pd.period_range('1-2000', '12-2001', freq='M'),
                  columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame[:5]

.

# Downsample the monthly frame to annual means (old API: how='mean').
annual_frame = frame.resample('A-DEC').mean()
annual_frame

.

# Upsample back to quarterly, forward-filling the annual values
# (old API: fill_method='ffill').
annual_frame.resample('Q-DEC').ffill()

.

# convention='start' places each annual value in the year's first quarter
# before forward-filling (old API: fill_method='ffill').
annual_frame.resample('Q-DEC', convention='start').ffill()

.

# Quarterly frequency with the fiscal year ending in March
# (old API: fill_method='ffill').
annual_frame.resample('Q-MAR').ffill()

10.7 时间序列绘图

# Load stock prices, keep three tickers, and regularize to business-daily
# frequency with forward fill (old API: resample('B', fill_method='ffill')).
# NOTE: the original path used backslashes in a normal string literal
# ('data\stock_px\...'); forward slashes avoid escape pitfalls and work on
# every platform.
close_px_all = pd.read_csv('data/stock_px/stock_px_2.csv',
                           parse_dates=True, index_col=0)
close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]
close_px = close_px.resample('B').ffill()
close_px.info()
close_px['AAPL'].plot()

.

# Plot just calendar year 2009 (year-string slicing on the index).
close_px.loc['2009'].plot()

.

# Plot Apple for January through March 2011.
close_px['AAPL'].loc['01-2011':'03-2011'].plot()

.

# Convert to quarterly frequency and plot 2009-2011
# (old API: resample('Q-DEC', fill_method='ffill')).
appl_q = close_px['AAPL'].resample('Q-DEC').ffill()
appl_q.loc['2009':'2011'].plot()

.

10.8 移动窗口函数

# 250-day moving average over the daily Apple price
# (the old top-level pd.rolling_mean is gone; use the .rolling() method).
close_px.AAPL.plot()
close_px.AAPL.rolling(window=250).mean().plot()

# Rolling standard deviation: values appear once min_periods=10
# observations have accumulated.
appl_std250 = close_px.AAPL.rolling(250, min_periods=10).std()
appl_std250[5:12]

# Expanding-window mean. BUG in the original: the lambda built a Rolling
# object with window=len(x) and never called .mean(), so it returned a
# Rolling object, not a series of means. Use the dedicated expanding API.
expanding_mean = lambda x: x.expanding(min_periods=1).mean()

# Apply a 60-day rolling mean to every column and plot on a log y-axis.
close_px.rolling(60).mean().plot(logy=True)

.

指数加权函数

# Exponentially weighted statistics give recent observations more weight.
# Compare a simple 60-day MA (top) with a span-60 EWMA (bottom).
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,
                         figsize=(12, 7))
aapl_px = close_px.AAPL['2005':'2009']

ma60 = aapl_px.rolling(60, min_periods=50).mean()
ewma60 = aapl_px.ewm(span=60).mean()

aapl_px.plot(style='k-', ax=axes[0])
ma60.plot(style='k--', ax=axes[0])
axes[0].set_title('Simple MA')

aapl_px.plot(style='k-', ax=axes[1])
ewma60.plot(style='k--', ax=axes[1])
axes[1].set_title('Exponentially-weighted MA')

二元移动窗口函数

# Binary moving-window function: rolling correlation of AAPL percent
# returns against S&P 500 returns (old API: pd.rolling_corr).
spx_px = close_px_all['SPX']
spx_rets = spx_px / spx_px.shift(1) - 1
returns = close_px.pct_change()
corr = returns.AAPL.rolling(125, min_periods=100).corr(spx_rets)
corr.plot()

10.9 性能和内存使用方面的注意事项(略)

最新回复(0)