微软Python视频教程知识点

it2025-08-01  3

Python for Beginners

# Use timedelta to compute yesterday's or last week's date
from datetime import datetime, timedelta

today = datetime.now()
one_day = timedelta(days=1)
yesterday = today - one_day
print('Yesterday was: ' + str(yesterday))

# The keyword is `weeks` (plural); `timedelta(week=1)` raises TypeError.
one_week = timedelta(weeks=1)
last_week = today - one_week
print('Last week was: ' + str(last_week))

# Display the individual components of a date
from datetime import datetime

current_date = datetime.now()
print('Day: ' + str(current_date.day))
print('Month: ' + str(current_date.month))
print('Year: ' + str(current_date.year))
print('Hour: ' + str(current_date.hour))
print('Minute: ' + str(current_date.minute))
print('Second: ' + str(current_date.second))

# Convert a string to a date
from datetime import datetime

birthday = input('When is your birthday(dd/mm/yyyy)?')
# strptime raises ValueError when the input does not match the format.
birthday_date = datetime.strptime(birthday, '%d/%m/%Y')
print('Birthday: ' + str(birthday_date))

# Emulate a switch statement with a dict of anonymous functions.
#
# switch syntax in C-like languages:
#
#     switch (expression) {
#         case value:
#             statements
#             break;
#         case value:
#             statements
#             break;
#         ...
#         default:
#             statements
#     }
#
# Each dict key plays the role of a `case` label and each lambda is the
# branch body.
switch = {
    0: lambda x: x + 2,
    1: lambda x: x ** 2,
    2: lambda x: abs(x),
}
print('switch字典的输出结果为:{0}'.format(switch[0](-34)))

数组与列表的区别

数组中存储的值必须是相同的类型,而列表可以存储任意类型的值。实际使用中大部分时间用到的都是列表;只有当你开始学习机器学习、编写自己的模型时,才会更多地用到数组。

# Using an array
from array import array

# Type code 'd' means C double, i.e. an array of Python floats.
scores = array('d')
scores.append(97)
scores.append(98)

# Print the whole array, then a single element by index.
print(scores)
print(scores[1])

在 VSCode 中,按 Ctrl + D 可以逐个选中下一个相同的内容,从而同时编辑多处。

# ---- Virtual environments ----
# Install virtualenv:
#     pip install virtualenv
#
# Create a virtual environment:
#     Windows:    python -m venv <directory>
#     OSX/Linux:  virtualenv <directory>
#
# Activate the virtual environment:
#     cmd:         <directory>\Scripts\Activate.bat
#     PowerShell:  <directory>\Scripts\Activate.ps1
#     bash shell:  . ./<directory>/Scripts/activate
#     OSX/Linux:   source <directory>/bin/activate
#
# Deactivate the virtual environment (from any shell where it is active):
#     deactivate

# ---- Environment variables ----
# Never write sensitive information into source code.
#     pip install python-dotenv
#
# .env file (the key must match the name passed to os.getenv below):
#     DATABASE=Sample_Connection_String

# app.py
import os

from dotenv import load_dotenv

load_dotenv()
database = os.getenv('DATABASE')  # None when the variable is not set
print(database)


# ---- Decorators ----
def logger(func):
    """Return a wrapper that prints a log line before and after *func* runs.

    The wrapper takes no arguments, so it only suits zero-argument functions.
    """
    def wrapper():
        print('Logging execution')
        func()
        print('Done logging')
    return wrapper


@logger
def sample():
    """Demo function whose call is wrapped by ``logger``."""
    print('-- Inside sample function')


sample()

More Python for Beginners

# Sorting by a chosen key
def sorter(item):
    """Return the value used as the sort key: the item's 'name' entry."""
    return item['name']


presenters = [
    {'name': 'Susan', 'age': 50},
    {'name': 'Christopher', 'age': 47},
]
presenters.sort(key=sorter)
print(presenters)

# The same sort with a lambda function
presenters = [
    {'name': 'Susan', 'age': 50},
    {'name': 'Christopher', 'age': 47},
]
presenters.sort(key=lambda item: item['name'])
print(presenters)


# Classes and constructors
class Presenter:
    """Presenter whose ``name`` attribute is exposed through a property."""

    def __init__(self, name):
        # Assigning here goes through the property setter below.
        self.name = name

    @property
    def name(self):
        print('In the getter')
        return self.__name

    @name.setter
    def name(self, value):
        print('In the setter')
        self.__name = value


presenter = Presenter('Chris')
presenter.name = 'Christopher'
print(presenter.name)


# Class inheritance
class Person:
    """Base class that stores a name and can greet."""

    def __init__(self, name):
        self.name = name

    def say_hello(self):
        print('Hello, ' + self.name)


class Student(Person):
    """A Person who also attends a school."""

    def __init__(self, name, school):
        super().__init__(name)  # call the parent constructor to set name
        self.school = school

    def sing_school_song(self):
        print('Ode to ' + self.school)

    def say_hello(self):
        # An overriding method does not call the parent version
        # automatically; it has to be invoked explicitly.
        super().say_hello()
        print('I am rather tired')

    def __str__(self):
        # Called when the object is printed.
        return f'{self.name} attends {self.school}'


student = Student('Christopher', 'UVM')
student.say_hello()
student.sing_school_song()
print(student)  # triggers the __str__ magic method

# isinstance tells whether an object belongs to a class, taking
# inheritance into account
isinstance(student, Student)  # True
isinstance(student, Person)   # True

# issubclass checks the subclass relationship
issubclass(Student, Person)   # True


# Multiple inheritance: inheriting from more than one class.
# Java and C# do not support it because it gets confusing quickly.

# Logging
class Loggable:
    """Mixin that prints a log message built from ``title``."""

    def __init__(self):
        self.title = ''

    def log(self):
        print('Log message from ' + self.title)


# Database connection
class Connection:
    """Mixin that prints a connect message built from ``server``."""

    def __init__(self):
        self.server = ''

    def connect(self):
        print('Connecting to database on ' + self.server)


# Framework entry point
def framework(item):
    """Call whichever of ``connect`` / ``log`` the item's base classes provide."""
    if isinstance(item, Connection):  # check the inherited class
        item.connect()
    if isinstance(item, Loggable):  # check the inherited class
        item.log()


# Database class
class SqlDatabase(Connection, Loggable):
    """Inherits from both mixins and sets the attributes they read."""

    def __init__(self):
        # Sets both attributes directly instead of calling the parent
        # constructors.
        self.title = 'Sql Connection Demo'
        self.server = 'Some_Server'


# Instance
sql_connection = SqlDatabase()
framework(sql_connection)

# ---- File system management ----
# Python 3.6+
from pathlib import Path

# Current working directory
cwd = Path.cwd()
print('Current working directory:\n' + str(cwd))

# Join a directory and a file name
new_file = cwd.joinpath('new_file.txt')
print('Full path:\n' + str(new_file))

# Does the file exist?
print('Does that file exist? ' + str(new_file.exists()))

# Parent directory
parent = cwd.parent

# Is it a directory?
print('Is this a directory? ' + str(parent.is_dir()))

# Is it a file?
print('Is this a file? ' + str(parent.is_file()))

# List the sub-directories of the parent directory
print('\n----- directory contents -----')
for child in parent.iterdir():
    if child.is_dir():
        print(child)

demo_file = cwd.joinpath('demo.txt')

# File name
print('file name: ' + demo_file.name)

# File extension
print('file suffix: ' + demo_file.suffix)

# Name of the containing folder
print('file folder: ' + demo_file.parent.name)

# File size (stat() raises FileNotFoundError if demo.txt does not exist)
print('file size: ' + str(demo_file.stat().st_size))

# ---- File I/O ----
# Opening a file:
#     stream = open(file_name, mode, buffer_size)
#
# Modes:
#     r - open read-only, positioned at the start of the file (default)
#     w - open for writing; an existing file is overwritten, a missing
#         file is created
#     a - open for appending, positioned at the end of an existing file;
#         a missing file is created
#     x - create a new file for writing; raises an error if it already
#         exists
#     + - read and write
#     t - text mode (default)
#     b - binary mode
stream = open('demo.txt')

# Is the stream readable?
print(stream.readable())

# Read the first character
print(stream.read(1))

# Read a line
print(stream.readline())

# Close the stream
stream.close()

# Writing a file
stream = open('output.txt', 'wt')

# Is the stream writable?
print(str(stream.writable()))

# Write a single character
stream.write('H')

# Write several strings at once, without an explicit loop
stream.writelines(['ello', ' ', 'world'])

# Write a newline
stream.write('\n')

# Close the stream, which also flushes buffered data to the file
stream.close()

# Moving around in a stream
stream = open('output.txt', 'wt')  # overwrite the file

# Write a string
stream.write('demo!')

# Print the current position in the stream
print(str(stream.tell()))

# Move the position back to the start of the stream
stream.seek(0)

# Overwrites the first four characters
stream.write('cool')

# Flush: write buffered data to the file immediately and empty the buffer
stream.flush()
stream.close()

# Opening and writing inside try / finally so the stream is always closed.
# The open() call stays outside the try block: if it fails there is no
# stream to close.
stream = open('output.txt', 'wt')
try:
    stream.write('Lorem ipsum dolar')
finally:
    stream.close()

# The with statement does the same thing more simply
with open('output.txt', 'wt') as stream:
    stream.write('Lorem ipsum dolar')

# ---- Asynchronous programming ----
# Synchronous version
from timeit import default_timer  # used to time the code
import requests


def load_data(delay):
    """Fetch httpbin's delay endpoint synchronously and return the body text."""
    print(f'Starting {delay} second timer')
    text = requests.get(f'https://httpbin.org/delay/{delay}').text
    print(f'Completed {delay} second timer')
    return text


def run_demo():
    """Run the two requests one after the other and print the elapsed time."""
    start_time = default_timer()

    two_data = load_data(2)
    three_data = load_data(3)

    elapsed_time = default_timer() - start_time
    # '.2f' prints two decimal places; a bare '.2' means two significant
    # digits and switches to exponent notation for larger values.
    print(f'The operation took {elapsed_time:.2f} seconds')


def main():
    run_demo()


main()

# Asynchronous version
from timeit import default_timer
import aiohttp  # asynchronous HTTP library
import asyncio


async def load_data(session, delay):
    """Fetch httpbin's delay endpoint through *session* and return the body text."""
    print(f'Starting {delay} second timer')
    async with session.get(f'https://httpbin.org/delay/{delay}') as resp:
        text = await resp.text()
        print(f'Completed {delay} second timer')
        return text


async def main():
    # Start the timer
    start_time = default_timer()

    # Creating a single session
    async with aiohttp.ClientSession() as session:
        # Setup our tasks and get them running
        two_task = asyncio.create_task(load_data(session, 2))
        three_task = asyncio.create_task(load_data(session, 3))

        # Simulate other processing
        await asyncio.sleep(1)
        print('Doing other work')

        # Let's go get our values
        two_result = await two_task
        three_result = await three_task

        # Print our results
        elapsed_time = default_timer() - start_time
        print(f'The operation took {elapsed_time:.2f} seconds')


asyncio.run(main())

Even More Python for Beginners: Data Science Tools

# ---- Pandas ----
import pandas as pd

# A Series is a one-dimensional array, similar to a Python list
airports = pd.Series([
    'Seattle-Tacoma',
    'Dulles',
    'London Heathrow',
    'Schiphol',
    'Changi',
    'Pearson',
    'Narita',
])

# Reference a single value of a Series by its index
airports[2]

# Loop over every value in the Series
for value in airports:
    print(value)

# DataFrame
# Most of the time in pandas we work with two-dimensional data,
# which is what a DataFrame stores.
airports = pd.DataFrame([
    ['Seattle-Tacoma', 'Seattle', 'USA'],
    ['Dulles', 'Washington', 'USA'],
    ['London Heathrow', 'London', 'United Kingdom'],
    ['Schiphol', 'Amsterdam', 'Netherlands'],
    ['Changi', 'Singapore', 'Singapore'],
    ['Pearson', 'Toronto', 'Canada'],
    ['Narita', 'Tokyo', 'Japan'],
])

# Use the columns parameter to name the columns
airports = pd.DataFrame(
    [
        ['Seattle-Tacoma', 'Seattle', 'USA'],
        ['Dulles', 'Washington', 'USA'],
        ['London Heathrow', 'London', 'United Kingdom'],
        ['Schiphol', 'Amsterdam', 'Netherlands'],
        ['Changi', 'Singapore', 'Singapore'],
        ['Pearson', 'Toronto', 'Canada'],
        ['Narita', 'Tokyo', 'Japan'],
    ],
    columns=['Name', 'City', 'Country'],
)

# ---- DataFrame: inspecting the data ----
import pandas as pd

airports = pd.DataFrame(
    [
        ['Seattle-Tacoma', 'Seattle', 'USA'],
        ['Dulles', 'Washington', 'USA'],
        ['Heathrow', 'London', 'United Kingdom'],
        ['Schiphol', 'Amsterdam', 'Netherlands'],
        ['Changi', 'Singapore', 'Singapore'],
        ['Pearson', 'Toronto', 'Canada'],
        ['Narita', 'Tokyo', 'Japan'],
    ],
    columns=['Name', 'City', 'Country'],
)

# First 3 rows
airports.head(3)

# Last 3 rows
airports.tail(3)

# Number of rows and columns
airports.shape

# More detailed information
airports.info()
# The output includes:
# - the number of rows and the range of index values
# - the number of columns
# - for each column, the non-null count and the data type
#
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 7 entries, 0 to 6
# Data columns (total 3 columns):
# Name       7 non-null object
# City       7 non-null object
# Country    7 non-null object
# dtypes: object(3)
# memory usage: 148.0+ bytes

# ---- DataFrame: querying ----
import pandas as pd

airports = pd.DataFrame(
    [
        ['Seattle-Tacoma', 'Seattle', 'USA'],
        ['Dulles', 'Washington', 'USA'],
        ['London Heathrow', 'London', 'United Kingdom'],
        ['Schiphol', 'Amsterdam', 'Netherlands'],
        ['Changi', 'Singapore', 'Singapore'],
        ['Pearson', 'Toronto', 'Canada'],
        ['Narita', 'Tokyo', 'Japan'],
    ],
    columns=['Name', 'City', 'Country'],
)

# Return a single column
airports['City']

# Return several columns (put the column names in a list)
airports[['Name', 'Country']]

# Get a value by position
airports.iloc[0, 0]  # 'Seattle-Tacoma'
airports.iloc[2, 2]  # 'United Kingdom'

# All rows and all columns
airports.iloc[:, :]

# All values of the first 2 rows
airports.iloc[0:2, :]

# All values of the first 2 columns
airports.iloc[:, 0:2]

# All values of the 1st and 3rd columns
airports.iloc[:, [0, 2]]

# Select columns by name
airports.loc[:, ['Name', 'Country']]

# ---- Reading and writing CSV files ----
import pandas as pd

# read_csv loads the contents of a CSV file into a DataFrame
airports_df = pd.read_csv('Data/airports.csv')

# Skip malformed rows. `error_bad_lines=False` was removed in pandas 2.0;
# `on_bad_lines='skip'` (pandas >= 1.3) is the replacement.
airports_df = pd.read_csv(
    'Data/airportsInvalidRows.csv',
    on_bad_lines='skip',
)

# Files without a header row
airports_df = pd.read_csv(
    'Data/airportsNoHeaderRows.csv',
    header=None,
)

# Use the names parameter to supply the column names
airports_df = pd.read_csv(
    'Data/airportsNoHeaderRows.csv',
    header=None,
    names=['Name', 'City', 'Country'],
)

# Values missing from the file show up as NaN in the DataFrame

# Write the DataFrame to a CSV file
airports_df.to_csv('Data/MyNewCSVFile.csv')

# Pass index=False to leave the index column out of the CSV file
airports_df.to_csv(
    'Data/MyNewCSVFileNoIndex.csv',
    index=False,
)

# ---- DataFrame: column operations ----
# Syntax for dropping a column:
#     DataFrameName.drop(columns=['columnname'])
#
# NOTE(review): delays_df is used below before these notes load it, and the
# column names here do not appear in the info() listing further down --
# presumably they come from a different sample file; confirm before running.

# Assign the result without Actual_arr_time to a new DataFrame;
# the column is still present in the original DataFrame.
new_df = delays_df.drop(columns=['Actual_arr_time'])

# Use inplace=True to drop the column from the original DataFrame
delays_df.drop(columns=['Actual_arr_time'], inplace=True)

# Copy a slice of columns into a new DataFrame
desc_df = delays_df.loc[:, ['Origin_airport', 'Dest_airport']]

# ---- Preprocessing: missing values and duplicate rows ----
# Look for missing values
delays_df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 300000 entries, 0 to 299999
# Data columns (total 16 columns):
# FL_DATE                300000 non-null object
# OP_UNIQUE_CARRIER      300000 non-null object
# TAIL_NUM               299660 non-null object
# OP_CARRIER_FL_NUM      300000 non-null int64
# ORIGIN                 300000 non-null object
# DEST                   300000 non-null object
# CRS_DEP_TIME           300000 non-null int64
# DEP_TIME               296825 non-null float64
# DEP_DELAY              296825 non-null float64
# CRS_ARR_TIME           300000 non-null int64
# ARR_TIME               296574 non-null float64
# ARR_DELAY              295832 non-null float64
# CRS_ELAPSED_TIME       300000 non-null int64
# ACTUAL_ELAPSED_TIME    295832 non-null float64
# AIR_TIME               295832 non-null float64
# DISTANCE               300000 non-null int64
# dtypes: float64(6), int64(5), object(5)
# memory usage: 30.9+ MB
#
# TAIL_NUM, DEP_TIME, DEP_DELAY, ARR_TIME, ARR_DELAY, ACTUAL_ELAPSED_TIME
# and AIR_TIME have fewer than 300000 non-null entries, i.e. they contain
# missing values.

# dropna returns a copy without the rows that contain null/missing values;
# the original DataFrame is left unchanged.
delay_no_nulls_df = delays_df.dropna()
# delay_no_nulls_df.info() output:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 295832 entries, 0 to 299999
# Data columns (total 16 columns):
# FL_DATE                295832 non-null object
# OP_UNIQUE_CARRIER      295832 non-null object
# TAIL_NUM               295832 non-null object
# OP_CARRIER_FL_NUM      295832 non-null int64
# ORIGIN                 295832 non-null object
# DEST                   295832 non-null object
# CRS_DEP_TIME           295832 non-null int64
# DEP_TIME               295832 non-null float64
# DEP_DELAY              295832 non-null float64
# CRS_ARR_TIME           295832 non-null int64
# ARR_TIME               295832 non-null float64
# ARR_DELAY              295832 non-null float64
# CRS_ELAPSED_TIME       295832 non-null int64
# ACTUAL_ELAPSED_TIME    295832 non-null float64
# AIR_TIME               295832 non-null float64
# DISTANCE               295832 non-null int64
# dtypes: float64(6), int64(5), object(5)
# memory usage: 32.7+ MB

# inplace=True removes the rows from the original DataFrame itself
delays_df.dropna(inplace=True)
# delays_df.info() now reports the same summary as delay_no_nulls_df above:
# Int64Index: 295832 entries, 0 to 299999, with all 16 columns showing
# 295832 non-null values (dtypes: float64(6), int64(5), object(5);
# memory usage: 32.7+ MB).

# Find duplicates: duplicated() marks each row that repeats an earlier row
airports_df.duplicated()

# drop_duplicates removes the duplicate rows
airports_df.drop_duplicates(inplace=True)

# ---- Scikit-learn ----
# Splitting the data
import pandas as pd

# Load the CSV
delays_df = pd.read_csv('Data/Lots_of_flight_data.csv')

# Check the number of rows and columns
# delays_df.shape

# X holds only the feature columns used to train the model
X = delays_df.loc[:, ['DISTANCE', 'CRS_ELAPSED_TIME']]

# y holds only the value we want the model to predict
y = delays_df.loc[:, ['ARR_DELAY']]

# Split into training data and test data.
# train_test_split puts 30% of the rows into the test DataFrames and the
# other 70% into the training DataFrames used to fit the model.
# Passing a fixed random_state means the same rows go to the test set every
# time the code runs, which makes the results repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# ---- Training a linear regression model ----
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CSV file
delays_df = pd.read_csv('Data/Lots_of_flight_data.csv')

# Drop rows that contain null values
delays_df.dropna(inplace=True)

# Feature data goes into X
X = delays_df.loc[:, ['DISTANCE', 'CRS_ELAPSED_TIME']]

# Label data goes into y
y = delays_df.loc[:, ['ARR_DELAY']]

# Split into training data and test data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a linear regression model on the training data in X_train / y_train
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()   # create the LinearRegression object
regressor.fit(X_train, y_train)  # train the model with fit
# Output recorded in the original notes:
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# The regressor object now contains the trained linear regression model.

# ---- Testing the model ----
# Use LinearRegression.predict to let the trained model predict the test
# data in X_test; the predictions are stored in y_pred.
y_pred = regressor.predict(X_test)
# Comparing y_pred with y_test shows there is still a gap between them,
# so the model needs tuning.

# ---- Evaluating the model's accuracy ----
# With a trained model in hand there are many metrics for checking its
# accuracy. They are all numeric computations, and Scikit-learn and numpy
# cover most of the work.
#
# Mean squared error (MSE)
# MSE is the average of the squared differences between the actual
# observations and the model's predictions. The lower the MSE, the better
# the model.
#     MSE = mean((actuals - predicteds)^2)
from sklearn import metrics

print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# Mean Squared Error: 2250.4445141530855

# Root mean squared error (RMSE)
# RMSE is the square root of the MSE, which puts the error back in the
# units of the predicted value. The lower the RMSE, the better the model.
#     RMSE = sqrt(MSE)
# numpy, which contains a large number of math functions, can compute the
# square root of the MSE.
import numpy as np
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# ---- Choosing between NumPy and pandas ----
type(y_pred)  # numpy.ndarray
type(y_test)  # pandas.core.frame.DataFrame

# Creating a one-dimensional numpy array looks much like creating a pandas
# Series, but uses a different constructor: array() versus Series().
import numpy as np

airports_array = np.array(['Pearson', 'Changi', 'Narita'])
print(airports_array)
print(airports_array[2])
# Output:
# ['Pearson' 'Changi' 'Narita']
# Narita

airports_series = pd.Series(['Pearson', 'Changi', 'Narita'])
print(airports_series)
print(airports_series[2])
# Output:
# 0    Pearson
# 1     Changi
# 2     Narita
# dtype: object
# Narita

# Two-dimensional arrays
# The difference when printing: pandas shows the numeric index labels,
# while in NumPy the index is implicit.
airports_array = np.array([
    ['YYZ', 'Pearson'],
    ['SIN', 'Changi'],
    ['NRT', 'Narita'],
])
print(airports_array)
print(airports_array[0, 0])
# Output:
# [['YYZ' 'Pearson']
#  ['SIN' 'Changi']
#  ['NRT' 'Narita']]
# YYZ

airports_df = pd.DataFrame([
    ['YYZ', 'Pearson'],
    ['SIN', 'Changi'],
    ['NRT', 'Narita'],
])
print(airports_df)
print(airports_df.iloc[0, 0])
# Output:
#      0        1
# 0  YYZ  Pearson
# 1  SIN   Changi
# 2  NRT   Narita
# YYZ

# When DataFrame functionality is needed, a numpy object can be converted
# into a pandas object. The two libraries convert back and forth, so check
# types with type() as you go; that lets you combine the features of both.
predicted_df = pd.DataFrame(y_pred)
predicted_df.head()
最新回复(0)