Python for Beginners
from datetime import datetime, timedelta

# Capture "now" once so both offsets below are measured from the same instant.
today = datetime.now()

# Subtracting a timedelta from a datetime yields another datetime.
one_day = timedelta(days=1)
yesterday = today - one_day
print('Yesterday was: ' + str(yesterday))

# timedelta only accepts the plural keyword ``weeks``; ``week=1`` raises
# TypeError.
one_week = timedelta(weeks=1)
last_week = today - one_week
print('Last week was: ' + str(last_week))
from datetime import datetime

# Take one snapshot so every component printed below belongs to the same moment.
current_date = datetime.now()

print('Day: ' + str(current_date.day))
print('Month: ' + str(current_date.month))
print('Year: ' + str(current_date.year))
print('Hour: ' + str(current_date.hour))
# The label was previously misspelled as 'Minuter'.
print('Minute: ' + str(current_date.minute))
print('Second: ' + str(current_date.second))
from datetime import datetime

# Read the date as text, then parse it with the day/month/four-digit-year
# pattern; strptime raises ValueError when the text does not match.
birthday = input('When is your birthday(dd/mm/yyyy)?')
birthday_date = datetime.strptime(birthday, '%d/%m/%Y')

birthday_text = str(birthday_date)
print('Birthday: ' + birthday_text)
'''
switch语法
switch(expression){
case value :
语句块
break;
case value :
语句块
break;
...
default :
语句块
}
'''
# Emulate a switch statement with a dictionary: each key plays the role of a
# "case" label and each value is the callable to run for that case.
switch = {
    0: lambda x: x + 2,
    1: lambda x: x ** 2,
    2: lambda x: abs(x),
}

# Look up case 0 and apply it to -34.
selected_case = switch[0]
print('switch字典的输出结果为:{0}'.format(selected_case(-34)))
数组与列表的区别
数组存储的值要有相同的类型,列表可存储任何类型的值,可以发现在使用中大部分时间都是用到列表,除非你开始学习机器学习,编写自己的模型,会更多的用到数组。
from array import array

# Typecode 'd' stores each element as a C double, so the ints added below are
# held as floats.
scores = array('d')
scores.extend((97, 98))

# The array is printed twice, exactly as in the original snippet.
print(scores)
print(scores)
VSCode 编辑多选项可以使用 Ctrl + D
pip install virtualenv
python -m venv <目录名>
virtualenv <目录名>
<目录名>\Scripts\Activate.bat
<目录名>\Scripts\Activate.ps1
. ./<目录名>/Scripts/activate
<目录名>/bin/activate
<目录名>\deactivate.bat
DATABASE = Sample_Connection_String
import os

from dotenv import load_dotenv

# load_dotenv() is expected to copy key/value pairs from a local .env file
# into the process environment before the lookup below.
load_dotenv()

# os.environ.get returns None when the DATABASE key is not set.
database = os.environ.get('DATABASE')
print(database)
def logger(func):
    """Decorate ``func`` so a message is printed before and after each call.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that prints 'Logging execution', calls ``func`` with
        whatever arguments it was given, prints 'Done logging', and returns
        the result of ``func``.
    """
    import functools  # local import keeps the snippet self-contained

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        print('Logging execution')
        # The original body called the undefined name ``fun``, which raised
        # NameError as soon as a decorated function was invoked.
        result = func(*args, **kwargs)
        print('Done logging')
        return result

    return wrapper
@logger
def sample():
    """Print a marker line from inside the decorated function."""
    print('-- Inside sample function')


# Calling the decorated name runs whatever ``logger`` returned for ``sample``.
sample()
More Python for Beginners
def sorter(item):
    """Return the value stored under ``'name'`` in ``item``.

    Intended as a ``key=`` function for sorting a list of dictionaries.
    Raises KeyError when ``item`` has no ``'name'`` entry.
    """
    name = item['name']
    return name
presenters = [
    {'name': 'Susan', 'age': 50},
    {'name': 'Christopher', 'age': 47},
]

# list.sort() reorders the list in place; ``sorter`` supplies each entry's
# sort key.
presenters.sort(key=sorter)
print(presenters)
presenters = [
    {'name': 'Susan', 'age': 50},
    {'name': 'Christopher', 'age': 47},
]

# Same ordering as with a named key function, but the key is written inline
# as a lambda.
presenters.sort(key=lambda entry: entry['name'])
print(presenters)
class Presenter:
    """Presenter whose ``name`` is exposed through a property.

    Both accessors print a trace line, so every read and write of ``name``
    is visible on stdout.
    """

    def __init__(self, name):
        # Assigning to ``self.name`` goes through the setter defined below.
        self.name = name

    @property
    def name(self):
        """Return the stored name after announcing the read."""
        print('In the getter')
        return self.__name

    @name.setter
    def name(self, value):
        """Store ``value`` as the name after announcing the write."""
        print('In the setter')
        self.__name = value
# Construction, assignment and the read below each go through the ``name``
# property of Presenter.
presenter = Presenter('Chris')
presenter.name = 'Christopher'
print(presenter.name)
class Person:
    """A person identified by a name."""

    def __init__(self, name):
        self.name = name

    def say_hello(self):
        """Print a greeting that includes this person's name."""
        greeting = 'Hello, ' + self.name
        print(greeting)
class Student(Person):
    """A Person who also attends a school."""

    def __init__(self, name, school):
        # Let Person store the name, then record the school locally.
        super().__init__(name)
        self.school = school

    def sing_school_song(self):
        """Print the opening of the school song."""
        song = 'Ode to ' + self.school
        print(song)

    def say_hello(self):
        """Print the inherited greeting followed by an extra remark."""
        super().say_hello()
        print('I am rather tired')

    def __str__(self):
        """Return '<name> attends <school>'."""
        return f'{self.name} attends {self.school}'
student = Student('Christopher', 'UVM')
student.say_hello()
student.sing_school_song()
# print() uses Student.__str__ to render the object.
print(student)

# Type checks; as bare expressions their results are only displayed in an
# interactive session.
isinstance(student, Student)
isinstance(student, Person)
issubclass(Student, Person)
class Loggable:
    """Adds a ``title`` attribute and a ``log`` method that prints it."""

    def __init__(self):
        self.title = ''

    def log(self):
        """Print a log line that names this object's title."""
        message = 'Log message from ' + self.title
        print(message)
class Connection:
    """Adds a ``server`` attribute and a ``connect`` method that prints it."""

    def __init__(self):
        self.server = ''

    def connect(self):
        """Print a message naming the server being connected to."""
        message = 'Connecting to database on ' + self.server
        print(message)
def framework(item):
    """Call ``connect`` and/or ``log`` on ``item`` depending on its type.

    The two checks are independent: an object that is both a Connection and
    a Loggable has both methods invoked, ``connect`` first.
    """
    if isinstance(item, Connection):
        item.connect()

    if isinstance(item, Loggable):
        item.log()
class SqlDatabase(Connection, Loggable):
    """Combines ``Connection`` and ``Loggable`` through multiple inheritance.

    The base-class initialisers are not invoked; the two attributes the
    inherited methods read are assigned here directly.
    """

    def __init__(self):
        self.server = 'Some_Server'
        self.title = 'Sql Connection Demo'
# SqlDatabase inherits from both Connection and Loggable, so framework()
# takes both of its branches for this object.
sql_connection = SqlDatabase()
framework(sql_connection)
from pathlib import Path

cwd = Path.cwd()
print('Current working directory:\n' + str(cwd))

# Build a path beneath the working directory; nothing is created on disk.
new_file = cwd / 'new_file.txt'
print('Full path:\n' + str(new_file))
print('Does that file exist? ' + str(new_file.exists()))

parent = cwd.parent
print('Is this a directory? ' + str(parent.is_dir()))
print('Is this a file? ' + str(parent.is_file()))

print('\n----- directory contents -----')
for child in parent.iterdir():
    # Only sub-directories are listed. The keyword here was mistyped as
    # ``is``, which is a syntax error.
    if child.is_dir():
        print(child)
demo_file = cwd / 'demo.txt'

print('file name: ' + demo_file.name)
print('file suffix: ' + demo_file.suffix)
print('file folder: ' + demo_file.parent.name)

# stat() queries the file system, so this line fails when demo.txt is missing.
size_in_bytes = demo_file.stat().st_size
print('file size: ' + str(size_in_bytes))
# General shape of a call to the built-in open(); file_name, mode and
# buffer_size are placeholders that this snippet does not define.
stream = open(file_name, mode, buffer_size)
r - 以只读方式打开文件。文件的指针将会放在文件的开头。这是默认模式。
w - 打开一个文件只用于写入。如果该文件已存在则将其覆盖。如果该文件不存在,创建新文件。
a - 打开一个文件用于追加。如果该文件已存在,文件指针将会放在文件的结尾。如果该文件不存在,创建新文件进行写入。
x - 写入的文件必须是新文件,如果已存在则会引发错误。
+ - 可读写模式
t - 文本方式,默认
b - 二进制方式
# No mode is given, so open() uses its default read-only text mode.
stream = open('demo.txt')

print(stream.readable())

# read(1) consumes one character; readline() then returns the rest of that
# line.
first_character = stream.read(1)
print(first_character)
rest_of_line = stream.readline()
print(rest_of_line)

stream.close()
# 'wt' opens for writing in text mode, truncating any existing file.
stream = open('output.txt', 'wt')

print(str(stream.writable()))

stream.write('H')
# writelines() writes the strings back to back; it adds no separators or
# newlines of its own.
stream.writelines(['ello', ' ', 'world'])
stream.write('\n')

stream.close()
stream = open('output.txt', 'wt')

stream.write('demo!')
# tell() reports the current position in the stream.
position = stream.tell()
print(str(position))

# Jump back to the start so the next write overwrites the first characters.
stream.seek(0)
stream.write('cool')

stream.flush()
stream.close()
# Open the file before entering ``try``: if open() itself fails there is no
# stream from this snippet to close, so the ``finally`` clause must not run
# for that failure.
stream = open('output.txt', 'wt')
try:
    stream.write('Lorem ipsum dolar')
finally:
    # Runs whether or not the write raised, so the file is always closed.
    stream.close()
# The with-statement closes the file when the block exits, including when the
# write raises.
with open('output.txt', 'wt') as stream:
    stream.write('Lorem ipsum dolar')
from timeit
import default_timer
import requests
def load_data(delay):
    """Request httpbin's ``/delay/<delay>`` URL and return the body text.

    Prints a line before the request is sent and another once the response
    has been received. The call blocks until the response arrives.
    """
    print(f'Starting {delay} second timer')
    response = requests.get(f'https://httpbin.org/delay/{delay}')
    text = response.text
    print(f'Completed {delay} second timer')
    return text
def run_demo():
    """Run the 2-second and 3-second loads one after the other and time them.

    The two calls are sequential, so the reported time covers both requests.
    """
    start_time = default_timer()

    # The response bodies are not needed here, so they are not kept.
    load_data(2)
    load_data(3)

    elapsed_time = default_timer() - start_time
    # ``.2f`` prints two decimal places. The previous ``.2`` meant two
    # significant digits and fell back to scientific notation from 10 s up.
    print(f'The operation took {elapsed_time:.2f} seconds')
def main():
    """Entry point: run the synchronous timing demo."""
    run_demo()


# Executed when the snippet is run.
main()
from timeit
import default_timer
import aiohttp
import asyncio
async def load_data(session, delay):
    """Request httpbin's ``/delay/<delay>`` URL through ``session``.

    Prints a line before the request and another after the body has been
    read, then returns the body text. ``session`` must provide an async
    ``get()`` context manager, as used below.
    """
    print(f'Starting {delay} second timer')
    async with session.get(f'http://httpbin.org/delay/{delay}') as resp:
        body = await resp.text()
        print(f'Completed {delay} second timer')
        return body
async def main():
    """Run both delayed requests concurrently and report the elapsed time."""
    start_time = default_timer()

    async with aiohttp.ClientSession() as session:
        # create_task() schedules the coroutines right away, so both requests
        # can be in flight while the code below keeps running.
        two_task = asyncio.create_task(load_data(session, 2))
        three_task = asyncio.create_task(load_data(session, 3))

        await asyncio.sleep(1)
        print('Doing other work')

        # Wait for both tasks; the response bodies are not needed here.
        await two_task
        await three_task

        elapsed_time = default_timer() - start_time
        # ``.2f`` prints two decimal places. The previous ``.2`` meant two
        # significant digits and fell back to scientific notation from 10 s.
        print(f'The operation took {elapsed_time:.2f} seconds')


asyncio.run(main())
Even more Python for Beginners Data science tools
import pandas as pd

# A Series is a single labelled column; with no index given the labels are
# 0..n-1.
airports = pd.Series(
    [
        'Seattle-Tacoma',
        'Dulles',
        'London Heathrow',
        'Schiphol',
        'Changi',
        'Pearson',
        'Narita',
    ]
)

# Look up one value by its label (shown only in an interactive session).
airports[2]

# Iterating a Series yields its values.
for airport_name in airports:
    print(airport_name)
# Without ``columns`` the DataFrame gets default integer column labels.
# 'Seattle-Tacoma' was misspelled 'Seatte-Tacoma' here, unlike in the Series
# example.
airports = pd.DataFrame(
    [
        ['Seattle-Tacoma', 'Seattle', 'USA'],
        ['Dulles', 'Washington', 'USA'],
        ['London Heathrow', 'London', 'United Kingdom'],
        ['Schiphol', 'Amsterdam', 'Netherlands'],
        ['Changi', 'Singapore', 'Singapore'],
        ['Pearson', 'Toronto', 'Canada'],
        ['Narita', 'Tokyo', 'Japan'],
    ]
)

# Same rows again, this time with explicit column names.
airports = pd.DataFrame(
    [
        ['Seattle-Tacoma', 'Seattle', 'USA'],
        ['Dulles', 'Washington', 'USA'],
        ['London Heathrow', 'London', 'United Kingdom'],
        ['Schiphol', 'Amsterdam', 'Netherlands'],
        ['Changi', 'Singapore', 'Singapore'],
        ['Pearson', 'Toronto', 'Canada'],
        ['Narita', 'Tokyo', 'Japan'],
    ],
    columns=['Name', 'City', 'Country'],
)
import pandas as pd

# 'Seattle-Tacoma' was misspelled 'Seatte-Tacoma' here.
airports = pd.DataFrame(
    [
        ['Seattle-Tacoma', 'Seattle', 'USA'],
        ['Dulles', 'Washington', 'USA'],
        ['Heathrow', 'London', 'United Kingdom'],
        ['Schiphol', 'Amsterdam', 'Netherlands'],
        ['Changi', 'Singapore', 'Singapore'],
        ['Pearson', 'Toronto', 'Canada'],
        ['Narita', 'Tokyo', 'Japan'],
    ],
    columns=['Name', 'City', 'Country'],
)

# The first three are bare expressions, so their results are only shown in
# an interactive session.
airports.head(3)   # first three rows
airports.tail(3)   # last three rows
airports.shape     # (rows, columns)
airports.info()    # prints the index range, column dtypes and non-null counts
'''
返回的信息包括:
- 行数和索引值的范围
- 列数
- 每列的信息,是否为空值,数据类型
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
Name 7 non-null object
City 7 non-null object
Country 7 non-null object
dtypes: object(3)
memory usage: 148.0+ bytes
'''
import pandas as pd

# 'Seattle-Tacoma' was misspelled 'Seatte-Tacoma' here.
airports = pd.DataFrame(
    [
        ['Seattle-Tacoma', 'Seattle', 'USA'],
        ['Dulles', 'Washington', 'USA'],
        ['London Heathrow', 'London', 'United Kingdom'],
        ['Schiphol', 'Amsterdam', 'Netherlands'],
        ['Changi', 'Singapore', 'Singapore'],
        ['Pearson', 'Toronto', 'Canada'],
        ['Narita', 'Tokyo', 'Japan'],
    ],
    columns=['Name', 'City', 'Country'],
)

# Column selection by name: one name gives a Series, a list of names gives a
# DataFrame.
airports['City']
airports[['Name', 'Country']]

# iloc selects by integer position: [row, column].
airports.iloc[0, 0]
airports.iloc[2, 2]
airports.iloc[:, :]        # every row and column
airports.iloc[0:2, :]      # first two rows
airports.iloc[:, 0:2]      # first two columns
airports.iloc[:, [0, 2]]   # first and third columns

# loc selects by label instead of position.
airports.loc[:, ['Name', 'Country']]
import pandas as pd

airports_df = pd.read_csv('Data/airports.csv')

# Skip malformed rows instead of raising. ``on_bad_lines='skip'`` replaces
# ``error_bad_lines=False``, which was deprecated in pandas 1.3 and removed
# in pandas 2.0.
airports_df = pd.read_csv(
    'Data/airportsInvalidRows.csv',
    on_bad_lines='skip',
)

# header=None stops the first data row from being used as column names.
airports_df = pd.read_csv(
    'Data/airportsNoHeaderRows.csv',
    header=None,
)

# Same file, with column names supplied explicitly.
airports_df = pd.read_csv(
    'Data/airportsNoHeaderRows.csv',
    header=None,
    names=['Name', 'City', 'Country'],
)

# By default to_csv() also writes the index; index=False leaves it out.
airports_df.to_csv('Data/MyNewCSVFile.csv')
airports_df.to_csv(
    'Data/MyNewCSVFileNoIndex.csv',
    index=False,
)
# Template for dropping columns; DataFrameName and 'columnname' are
# placeholders that this snippet does not define.
DataFrameName.drop(columns=['columnname'])
# drop() returns a new DataFrame and leaves delays_df unchanged.
new_df = delays_df.drop(columns=['Actual_arr_time'])

# inplace=True removes the column from delays_df itself.
delays_df.drop(columns=['Actual_arr_time'], inplace=True)

# Keep every row but only the two named columns.
desc_df = delays_df.loc[:, ['Origin_airport', 'Dest_airport']]

# Was written ``delays.df.info()``, which refers to the undefined name
# ``delays``.
delays_df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 16 columns):
FL_DATE 300000 non-null object
OP_UNIQUE_CARRIER 300000 non-null object
TAIL_NUM 299660 non-null object
OP_CARRIER_FL_NUM 300000 non-null int64
ORIGIN 300000 non-null object
DEST 300000 non-null object
CRS_DEP_TIME 300000 non-null int64
DEP_TIME 296825 non-null float64
DEP_DELAY 296825 non-null float64
CRS_ARR_TIME 300000 non-null int64
ARR_TIME 296574 non-null float64
ARR_DELAY 295832 non-null float64
CRS_ELAPSED_TIME 300000 non-null int64
ACTUAL_ELAPSED_TIME 295832 non-null float64
AIR_TIME 295832 non-null float64
DISTANCE 300000 non-null int64
dtypes: float64(6), int64(5), object(5)
memory usage: 30.9+ MB
'''
TAIL_NUM、DEP_TIME、DEP_DELAY、ARR_TIME、ARR_DELAY、ACTUAL_ELAPSED_TIME和AIR_TIME有缺失值。
# dropna() returns a copy without the rows that contain missing values;
# delays_df itself is left unchanged.
delay_no_nulls_df = delays_df.dropna()
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 295832 entries, 0 to 299999
Data columns (total 16 columns):
FL_DATE 295832 non-null object
OP_UNIQUE_CARRIER 295832 non-null object
TAIL_NUM 295832 non-null object
OP_CARRIER_FL_NUM 295832 non-null int64
ORIGIN 295832 non-null object
DEST 295832 non-null object
CRS_DEP_TIME 295832 non-null int64
DEP_TIME 295832 non-null float64
DEP_DELAY 295832 non-null float64
CRS_ARR_TIME 295832 non-null int64
ARR_TIME 295832 non-null float64
ARR_DELAY 295832 non-null float64
CRS_ELAPSED_TIME 295832 non-null int64
ACTUAL_ELAPSED_TIME 295832 non-null float64
AIR_TIME 295832 non-null float64
DISTANCE 295832 non-null int64
dtypes: float64(6), int64(5), object(5)
memory usage: 32.7+ MB
'''
# inplace=True removes the rows with missing values from delays_df itself
# instead of returning a new DataFrame.
delays_df.dropna(inplace=True)
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 295832 entries, 0 to 299999
Data columns (total 16 columns):
FL_DATE 295832 non-null object
OP_UNIQUE_CARRIER 295832 non-null object
TAIL_NUM 295832 non-null object
OP_CARRIER_FL_NUM 295832 non-null int64
ORIGIN 295832 non-null object
DEST 295832 non-null object
CRS_DEP_TIME 295832 non-null int64
DEP_TIME 295832 non-null float64
DEP_DELAY 295832 non-null float64
CRS_ARR_TIME 295832 non-null int64
ARR_TIME 295832 non-null float64
ARR_DELAY 295832 non-null float64
CRS_ELAPSED_TIME 295832 non-null int64
ACTUAL_ELAPSED_TIME 295832 non-null float64
AIR_TIME 295832 non-null float64
DISTANCE 295832 non-null int64
dtypes: float64(6), int64(5), object(5)
memory usage: 32.7+ MB
'''
# duplicated() flags rows that repeat an earlier row. The variable was
# misspelled ``aipports_df``, an undefined name.
airports_df.duplicated()

# Remove the repeated rows from airports_df itself.
airports_df.drop_duplicates(inplace=True)
import pandas as pd

delays_df = pd.read_csv('Data/Lots_of_flight_data.csv')

# X holds the two input columns; y holds the column to predict. Both are
# selected with a list of labels, so both are DataFrames.
feature_columns = ['DISTANCE', 'CRS_ELAPSED_TIME']
X = delays_df.loc[:, feature_columns]
y = delays_df.loc[:, ['ARR_DELAY']]
'''
使用Scikit-learn train_test_split将30%的数据放在测试DataFrame中
另外的70%的行放入训练DataFrame中,用来训练我们的模型
注意:通过为random_state指定一个值,我们可以确保如果再次运行代码,相同的行将被移动到测试数据帧中。
这使得我们的结果是可重复的。
'''
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing and train on the remaining 70%; the
# fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

delays_df = pd.read_csv('Data/Lots_of_flight_data.csv')

# Remove rows with missing values before selecting the modelling columns.
delays_df.dropna(inplace=True)

X = delays_df.loc[:, ['DISTANCE', 'CRS_ELAPSED_TIME']]
y = delays_df.loc[:, ['ARR_DELAY']]

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training rows, then predict for the held-out rows.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
'''
现在我们已经有了一个经过训练的模型
可以使用许多方法来检查模型的准确性
但所有这些指标都是基于科学计算
我们可以使用Scikit-learn和numpy完成大部分工作
均方误差(MSE)
MSE是模型在预测观察结果时执行的平均误差。MSE越低,模型越好。
MSE是实际观测值与模型预测值之间的平均平方差。
MSE = mean((actuals - predicteds)^2)
'''
from sklearn import metrics

# Compare the held-out targets with the model's predictions.
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)
'''
均方根误差(RMSE)
RMSE是模型在预测观察结果时执行的平均误差。RMSE越低,模型越好。
从数学上讲,RMSE是均方误差的平方根
RMSE = sqrt(MSE)
我们可以使用包含大量数学函数的numpy库来计算MSE的平方根
'''
import numpy as np

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', rmse)

# Inspect the container types (shown only in an interactive session).
type(y_pred)
type(y_test)
import numpy as np

# One-dimensional data: a NumPy array next to the equivalent pandas Series.
airports_array = np.array(['Pearson', 'Changi', 'Narita'])
print(airports_array)
print(airports_array[2])

airports_series = pd.Series(['Pearson', 'Changi', 'Narita'])
print(airports_series)
print(airports_series[2])

# Two-dimensional data: a NumPy array next to the equivalent DataFrame.
airports_array = np.array(
    [
        ['YYZ', 'Pearson'],
        ['SIN', 'Changi'],
        ['NRT', 'Narita'],
    ]
)
print(airports_array)
print(airports_array[0, 0])

airports_df = pd.DataFrame(
    [['YYZ', 'Pearson'], ['SIN', 'Changi'], ['NRT', 'Narita']]
)
print(airports_df)
print(airports_df.iloc[0, 0])
# Wrap the predictions in a DataFrame so the DataFrame helpers can be used.
predicted_df = pd.DataFrame(y_pred)

# First rows of the predictions (shown only in an interactive session).
predicted_df.head()