第十一章:常用标准库
本章学习目标
- 掌握常用标准库的使用
- 学会使用 collections 模块
- 掌握 datetime 和 time 模块
- 了解 functools 和 itertools 模块
- 学会使用 re 进行正则表达式匹配
11.1 collections 模块
collections 模块提供了专门的数据类型,补充内置的 list、dict、set、tuple。
11.1.1 Counter
计数器,用于统计可哈希对象中元素出现的次数:
from collections import Counter
# 统计列表元素
colors = ["red", "blue", "red", "green", "blue", "blue"]
counter = Counter(colors)
print(counter) # Counter({'blue': 3, 'red': 2, 'green': 1})
# 统计字符串字符
text = "hello world"
char_counter = Counter(text)
print(char_counter) # Counter({'l': 3, 'o': 2, 'h': 1, 'e': 1, ...})
# 统计词频
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
word_counter = Counter(words)
print(word_counter) # Counter({'apple': 3, 'banana': 2, 'cherry': 1})
# most_common 方法
print(word_counter.most_common(2)) # [('apple', 3), ('banana', 2)]
# 运算
c1 = Counter([1, 2, 3, 2, 1])
c2 = Counter([2, 3, 4])
print(c1 + c2) # Counter({2: 3, 1: 2, 3: 2, 4: 1})  # counts are added
print(c1 - c2) # Counter({1: 2, 2: 1})  # counts are subtracted; only positive results are kept
print(c1 & c2) # Counter({2: 1, 3: 1})  # intersection: minimum of the two counts
print(c1 | c2) # Counter({1: 2, 2: 2, 3: 1, 4: 1})  # union: maximum of the two counts
11.1.2 defaultdict
字典子类,提供默认值:
from collections import defaultdict
# 使用 int 作为默认值(计数器)
word_count = defaultdict(int)
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
for word in words:
word_count[word] += 1
print(dict(word_count)) # {'apple': 3, 'banana': 2, 'cherry': 1}
# 使用 list 作为默认值(分组)
grouped = defaultdict(list)
names = ["Alice", "Bob", "Charlie", "Anna", "Brian"]
for name in names:
grouped[name[0]].append(name)
print(dict(grouped)) # {'A': ['Alice', 'Anna'], 'B': ['Bob', 'Brian'], 'C': ['Charlie']}
# 使用 dict 作为默认值(嵌套字典)
nested = defaultdict(dict)
nested["person"]["age"] = 25
print(dict(nested)) # {'person': {'age': 25}}
# 使用 lambda 自定义默认值
custom = defaultdict(lambda: [0, 0])
custom["score"].append(100)
print(dict(custom)) # {'score': [0, 0, 100]}
11.1.3 deque
双端队列,支持从两端快速添加和删除:
from collections import deque
# 创建
dq = deque([1, 2, 3])
print(dq) # deque([1, 2, 3])
# 添加元素
dq.append(4) # 右端添加
dq.appendleft(0) # 左端添加
print(dq) # deque([0, 1, 2, 3, 4])
# 删除元素
dq.pop() # 右端删除
dq.popleft() # 左端删除
print(dq) # deque([1, 2, 3])
# 扩展
dq.extend([4, 5, 6])
dq.extendleft([-1, 0]) # items are pushed onto the left end one at a time, so they end up in reverse order
print(dq) # deque([0, -1, 1, 2, 3, 4, 5, 6])
# 旋转
dq = deque([1, 2, 3, 4, 5])
dq.rotate(2) # 向右旋转2位
print(dq) # deque([4, 5, 1, 2, 3])
dq.rotate(-1) # 向左旋转1位
print(dq) # deque([5, 1, 2, 3, 4])
# 限制最大长度(自动删除旧元素)
bounded = deque(maxlen=3)
bounded.extend([1, 2, 3, 4, 5])
print(bounded) # deque([3, 4, 5], maxlen=3)
# 实际应用:滑动窗口
def sliding_window(data: list, size: int) -> list:
    """Return every full-width window over ``data``.

    A ``deque`` bounded by ``maxlen=size`` holds the current window. Once it
    is full, each append evicts the oldest item, so the window slides forward
    by one position per element.

    Args:
        data: Items to slide over.
        size: Window width.

    Returns:
        The windows in order, each a list of ``size`` consecutive items.
        Empty when ``data`` holds fewer than ``size`` items.
    """
    window = deque(maxlen=size)
    result = []
    for item in data:
        window.append(item)
        # Nothing is emitted while the window is still filling up.
        if len(window) == size:
            result.append(list(window))
    return result
print(sliding_window([1, 2, 3, 4, 5], 3)) # [[1, 2, 3], [2, 3, 4], [3, 4, 5]]
11.1.4 namedtuple
创建带命名的元组子类:
from collections import namedtuple
# 定义 namedtuple
Point = namedtuple("Point", ["x", "y"])
Point3D = namedtuple("Point3D", "x y z")
Person = namedtuple("Person", "name age city")
# 使用
p = Point(10, 20)
print(p.x, p.y) # 10 20
print(p[0], p[1]) # 10 20
print(p._fields) # ('x', 'y')
# 转换为字典
print(p._asdict()) # {'x': 10, 'y': 20} (a plain dict since Python 3.8; earlier versions returned an OrderedDict)
# 替换字段
p2 = p._replace(x=100)
print(p2) # Point(x=100, y=20)
# 从序列创建
p3 = Point._make([30, 40])
print(p3) # Point(x=30, y=40)
# 继承 namedtuple
class Point(namedtuple("Point", ["x", "y"])):
    """2-D point: a named tuple of ``x`` and ``y`` plus a derived property."""

    @property
    def distance(self) -> float:
        """Euclidean distance from the origin (0, 0) to this point."""
        return (self.x ** 2 + self.y ** 2) ** 0.5
p = Point(3, 4)
print(p.distance) # 5.0
11.1.5 OrderedDict
有序字典(Python 3.7+ 普通 dict 已保证顺序):
from collections import OrderedDict
# 创建有序字典
d = OrderedDict()
d["a"] = 1
d["b"] = 2
d["c"] = 3
print(d) # OrderedDict([('a', 1), ('b', 2), ('c', 3)])
# 移动到末尾
d.move_to_end("a")
print(d) # OrderedDict([('b', 2), ('c', 3), ('a', 1)])
# 移动到开头
d.move_to_end("b", last=False) # "b" is already the first key at this point, so the order does not change
print(d) # OrderedDict([('b', 2), ('c', 3), ('a', 1)])
11.1.6 ChainMap
将多个映射合并为一个视图:
from collections import ChainMap
# 合并多个字典
dict1 = {"a": 1, "b": 2}
dict2 = {"b": 3, "c": 4}
dict3 = {"c": 5, "d": 6}
# ChainMap 查找顺序:maps[0] -> maps[1] -> ...
chain = ChainMap(dict1, dict2, dict3)
print(chain["a"]) # 1 (在 dict1 中)
print(chain["b"]) # 2 (在 dict1 中,优先)
print(chain["c"]) # 4 (在 dict2 中)
print(chain["d"]) # 6 (在 dict3 中)
# 修改只影响第一个映射
chain["a"] = 100
print(dict1) # {'a': 100, 'b': 2}
# maps 属性
print(chain.maps) # [dict1, dict2, dict3]
# new_child 创建新视图
child = chain.new_child({"e": 7})
print(child["e"]) # 7
print(child["a"]) # 100
11.2 datetime 模块
11.2.1 datetime 类
from datetime import datetime, date, time, timedelta
# 获取当前时间
now = datetime.now()
today = datetime.today()
print(now) # e.g. 2024-01-15 10:30:45.123456
print(today) # e.g. 2024-01-15 10:30:45.123460 (datetime.today() also carries the current time, not midnight)
# 创建特定时间
dt = datetime(2024, 1, 15, 10, 30, 0)
d = date(2024, 1, 15)
t = time(10, 30, 0)
print(dt) # 2024-01-15 10:30:00
# 访问组件
print(dt.year) # 2024
print(dt.month) # 1
print(dt.day) # 15
print(dt.hour) # 10
print(dt.minute) # 30
print(dt.second) # 0
# 星期几(0=Monday, 6=Sunday)
print(dt.weekday()) # 0 (Monday)
print(dt.isoweekday()) # 1 (Monday)
11.2.2 格式化与解析
from datetime import datetime
dt = datetime(2024, 1, 15, 10, 30, 45)
# 格式化
formatted = dt.strftime("%Y-%m-%d %H:%M:%S")
print(formatted) # 2024-01-15 10:30:45
# 常用格式
print(dt.strftime("%Y/%m/%d")) # 2024/01/15
print(dt.strftime("%H:%M:%S")) # 10:30:45
print(dt.strftime("%A, %B %d, %Y")) # Monday, January 15, 2024
# 解析
dt2 = datetime.strptime("2024-01-15 10:30:45", "%Y-%m-%d %H:%M:%S")
print(dt2) # 2024-01-15 10:30:45
# ISO 格式
dt_iso = datetime.fromisoformat("2024-01-15T10:30:45")
print(dt_iso) # 2024-01-15 10:30:45
11.2.3 timedelta
时间增量,用于日期时间运算:
from datetime import datetime, timedelta
# 创建 timedelta
one_day = timedelta(days=1)
one_hour = timedelta(hours=1)
half_hour = timedelta(minutes=30)
# 日期运算
now = datetime.now()
tomorrow = now + one_day
yesterday = now - one_day
print(f"明天: {tomorrow}")
print(f"昨天: {yesterday}")
# 时间差
delta = datetime(2024, 1, 20) - datetime(2024, 1, 15)
print(delta.days) # 5
print(delta.seconds) # 0
print(delta.total_seconds()) # 432000.0
# 计算工作日
def business_days(start: datetime, end: datetime) -> int:
    """Count the weekdays (Monday-Friday) from ``start`` to ``end``, inclusive.

    Days are sampled at ``start``, ``start + 1 day``, ... while the sample is
    not later than ``end``. Public holidays are not taken into account.

    Args:
        start: First day of the period.
        end: Last day of the period.

    Returns:
        The number of sampled days that fall on Monday-Friday; 0 when
        ``end`` is earlier than ``start``.
    """
    # timedelta.days is floored, so this is <= 0 whenever end < start and
    # range() below is then empty.
    span_days = (end - start).days + 1
    return sum(
        1
        for offset in range(span_days)
        # weekday(): Monday is 0 ... Sunday is 6, so < 5 means Mon-Fri.
        if (start + timedelta(days=offset)).weekday() < 5
    )
start = datetime(2024, 1, 1)
end = datetime(2024, 1, 10)
print(business_days(start, end)) # 8 (weekdays: Jan 1-5 and Jan 8-10, 2024)
11.3 functools 模块
11.3.1 lru_cache
函数结果缓存:
from functools import lru_cache
# 使用装饰器
@lru_cache(maxsize=128)
def fibonacci(n: int) -> int:
    """Return the n-th Fibonacci number (fibonacci(0) == 0, fibonacci(1) == 1).

    The ``lru_cache`` decorator memoises results, so each distinct ``n`` is
    computed only once even though the definition is doubly recursive.
    """
    if n < 2:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)
# 不使用缓存:指数级复杂度
# 使用缓存:线性复杂度
print(fibonacci(100)) # 很快
# 查看缓存信息
print(fibonacci.cache_info())
# CacheInfo(hits=98, misses=101, maxsize=128, currsize=101)
# 清除缓存
fibonacci.cache_clear()
11.3.2 partial
创建偏函数,固定部分参数:
from functools import partial
def power(base, exponent):
    """Return ``base`` raised to the power ``exponent``."""
    return base ** exponent
# 创建偏函数
square = partial(power, exponent=2)
cube = partial(power, exponent=3)
root = partial(power, exponent=0.5)
print(square(5)) # 25
print(cube(5)) # 125
print(root(16)) # 4.0
# 与内置函数结合
int_binary = partial(int, base=2)
print(int_binary("1010")) # 10
# 使用 sorted 的 key
from functools import partial
sort_by_second = partial(sorted, key=lambda x: x[1])
data = [(1, 3), (2, 1), (3, 2)]
print(sort_by_second(data)) # [(2, 1), (3, 2), (1, 3)]
11.3.3 reduce
对序列进行累积操作:
from functools import reduce
from operator import add, mul
# 基本用法
result = reduce(add, [1, 2, 3, 4, 5])
print(result) # 15
# 带初始值
result = reduce(add, [1, 2, 3, 4, 5], 10)
print(result) # 25
# 实现阶乘
def factorial(n: int) -> int:
    """Return ``n!`` by folding ``1 * 2 * ... * n`` with ``reduce``.

    The initial value 1 is what ``reduce`` returns when the range is empty,
    which makes ``factorial(0)`` equal to 1.
    """
    return reduce(mul, range(1, n + 1), 1)
print(factorial(5)) # 120
# 实现 max
def my_max(seq):
    """Return the largest item of ``seq`` using a pairwise ``reduce``.

    No initial value is passed to ``reduce``, so an empty ``seq`` raises
    ``TypeError``.
    """
    return reduce(lambda a, b: a if a > b else b, seq)
print(my_max([3, 1, 4, 1, 5, 9])) # 9
11.3.4 wraps
保留被装饰函数的元数据:
from functools import wraps
def my_decorator(func):
    """Wrap ``func`` so a message is printed before and after each call.

    ``functools.wraps`` copies ``func``'s metadata (``__name__``,
    ``__doc__``, ...) onto the wrapper, so the decorated function still
    identifies itself as the original.

    Returns:
        The wrapper; it forwards all arguments and returns ``func``'s result.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        print("调用前")
        result = func(*args, **kwargs)
        print("调用后")
        return result
    return wrapper
# The docstring below is printed by the demo that follows, so its text is
# part of the program's output and is kept verbatim.
@my_decorator
def example():
    """这是文档字符串"""
    return 42
print(example.__name__) # example (不是 wrapper!)
print(example.__doc__) # 这是文档字符串
11.3.5 singledispatch
函数重载(单分派):
from functools import singledispatch
@singledispatch
def process(data):
    """Fallback handler, used when no registered type matches ``data``."""
    print(f"处理数据: {data}")


@process.register(int)
def process_int(data):
    """Handler for ``int`` arguments: prints the value doubled."""
    print(f"处理整数: {data * 2}")


@process.register(str)
def process_str(data):
    """Handler for ``str`` arguments: prints the text upper-cased."""
    print(f"处理字符串: {data.upper()}")


@process.register(list)
def process_list(data):
    """Handler for ``list`` arguments: prints the number of items."""
    print(f"处理列表: {len(data)} 项")
# 使用
process(42) # 处理整数: 84
process("hello") # 处理字符串: HELLO
process([1, 2, 3]) # 处理列表: 3 项
11.4 itertools 模块
11.4.1 无限迭代器
import itertools
# count(start, step) - 无限计数器
for i in itertools.count(10, 2):
if i > 20:
break
print(i, end=" ") # 10 12 14 16 18 20
# cycle(iterable) - 无限循环
colors = itertools.cycle(["red", "green", "blue"])
for _ in range(5):
print(next(colors), end=" ") # red green blue red green
# repeat(elem, n) - yield elem n times (endlessly when n is omitted)
for value in itertools.repeat(5, 3):
    print(value, end=" ") # 5 5 5
11.4.2 有限迭代器
import itertools
# accumulate - 累积
result = list(itertools.accumulate([1, 2, 3, 4, 5]))
print(result) # [1, 3, 6, 10, 15]
# 自定义累积函数
import operator
result = list(itertools.accumulate([1, 2, 3, 4, 5], operator.mul))
print(result) # [1, 2, 6, 24, 120]
# chain - 连接
result = list(itertools.chain([1, 2], [3, 4], [5]))
print(result) # [1, 2, 3, 4, 5]
# chain.from_iterable - 扁平化
result = list(itertools.chain.from_iterable([[1, 2], [3, 4], [5]]))
print(result) # [1, 2, 3, 4, 5]
# compress - 过滤
result = list(itertools.compress("ABCDEF", [1, 0, 1, 0, 1, 1]))
print(result) # ['A', 'C', 'E', 'F']
# dropwhile, takewhile
result = list(itertools.dropwhile(lambda x: x < 3, [1, 2, 3, 4, 5]))
print(result) # [3, 4, 5]
result = list(itertools.takewhile(lambda x: x < 3, [1, 2, 3, 4, 5]))
print(result) # [1, 2]
# filterfalse
result = list(itertools.filterfalse(lambda x: x % 2 == 0, [1, 2, 3, 4, 5]))
print(result) # [1, 3, 5]
11.4.3 组合迭代器
import itertools
# product - 笛卡尔积
result = list(itertools.product([1, 2], ["a", "b"]))
print(result) # [(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b')]
# repeat=2: Cartesian product of the iterable with itself (A×A)
result = list(itertools.product([1, 2, 3], repeat=2))
print(result) # [(1, 1), (1, 2), (1, 3), (2, 1), ...]
# permutations - 排列
result = list(itertools.permutations([1, 2, 3], 2))
print(result) # [(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]
# combinations - 组合
result = list(itertools.combinations([1, 2, 3], 2))
print(result) # [(1, 2), (1, 3), (2, 3)]
# combinations_with_replacement - 带重复的组合
result = list(itertools.combinations_with_replacement([1, 2, 3], 2))
print(result) # [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]
11.5 re 模块(正则表达式)
11.5.1 基本匹配
import re
# search - 查找第一个匹配
text = "我的邮箱是 test@example.com"
match = re.search(r"[\w.-]+@[\w.-]+", text)
if match:
print(match.group()) # test@example.com
# match - 从字符串开头匹配
print(re.match(r"\d+", "123abc")) # <re.Match object>
print(re.match(r"\d+", "abc123")) # None
# fullmatch - 整个字符串匹配
print(re.fullmatch(r"\d+", "123")) # <re.Match object>
print(re.fullmatch(r"\d+", "123a")) # None
# findall - 查找所有匹配
text = "有 1 个苹果,2 个香蕉,3 个橙子"
numbers = re.findall(r"\d+", text)
print(numbers) # ['1', '2', '3']
11.5.2 元字符
import re
# . - 任意字符(除换行)
re.search(r"a.c", "abc") # 匹配
re.search(r"a.c", "aXc") # 匹配
# \d \D - 数字/非数字
re.search(r"\d+", "abc123") # 123
re.search(r"\D+", "123abc") # abc
# \w \W - 单词字符/非单词字符
re.search(r"\w+", "hello_world") # hello_world
# \s \S - 空白/非空白
re.search(r"\s+", "hello world") # 空格
# ^ $ - 开头/结尾
re.search(r"^hello", "hello world") # hello
re.search(r"world$", "hello world") # world
# [] - 字符集
re.search(r"[aeiou]", "hello") # e
re.search(r"[0-9]", "a1b2") # 1
# | - 或
re.search(r"cat|dog", "I have a cat") # cat
11.5.3 量词
import re
# * - 0次或多次
re.search(r"ab*c", "ac") # ac
re.search(r"ab*c", "abc") # abc
re.search(r"ab*c", "abbbc") # abbbc
# + - 1次或多次
re.search(r"ab+c", "ac") # None
re.search(r"ab+c", "abc") # abc
# ? - 0次或1次
re.search(r"colou?r", "color") # color
re.search(r"colou?r", "colour") # colour
# {n} - 恰好n次
re.search(r"\d{4}", "2024") # 2024
# {n,} - 至少n次
re.search(r"\d{2,}", "123") # 123
# {n,m} - n到m次
re.search(r"\d{2,4}", "12345") # 1234
# *? +? ?? - 非贪婪
re.search(r"<.*?>", "<html><body>") # <html>
re.search(r"<.*>", "<html><body>") # <html><body>
11.5.4 分组和捕获
import re
# () - 分组
match = re.search(r"(\d{4})-(\d{2})-(\d{2})", "2024-01-15")
print(match.group()) # 2024-01-15
print(match.group(1)) # 2024
print(match.group(2)) # 01
print(match.group(3)) # 15
print(match.groups()) # ('2024', '01', '15')
# 命名分组
match = re.search(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})", "2024-01-15")
print(match.group("year")) # 2024
print(match.groupdict()) # {'year': '2024', 'month': '01', 'day': '15'}
# (?:) - 不捕获分组
match = re.search(r"(?:ab)+", "ababab")
print(match.group()) # ababab
# \1 \2 - 反向引用
match = re.search(r"(\w+) \1", "hello hello")
print(match.group()) # hello hello
# | - 或(分支)
match = re.search(r"(cat|dog)", "I have a cat")
print(match.group()) # cat
11.5.5 替换
import re
# sub - 替换
text = "我的邮箱是 test@example.com"
result = re.sub(r"[\w.-]+@[\w.-]+", "[email]", text)
print(result) # 我的邮箱是 [email]
# sub 使用函数
def convert_date(match):
    """Replacement callback for ``re.sub``: turn Y-M-D groups into D/M/Y.

    Args:
        match: A match object with exactly three groups, in the order
            year, month, day.

    Returns:
        The date rewritten as ``day/month/year``.
    """
    year, month, day = match.groups()
    return f"{day}/{month}/{year}"
result = re.sub(r"(\d{4})-(\d{2})-(\d{2})", convert_date, "2024-01-15")
print(result) # 15/01/2024
# split - 分割
result = re.split(r"[,;\s]+", "a,b;c d;e")
print(result) # ['a', 'b', 'c', 'd', 'e']
11.5.6 编译和标志
import re
# 编译正则表达式(提高性能)
pattern = re.compile(r"\d+", re.IGNORECASE) # flags go in the second argument; IGNORECASE is only illustrative here, since \d+ contains no letters
# 使用编译后的模式
match = pattern.search("123ABC")
print(match.group()) # 123
# 常用标志
# re.IGNORECASE / re.I - 忽略大小写
# re.MULTILINE / re.M - 多行模式
# re.DOTALL / re.S - 点号匹配所有
# re.VERBOSE / re.X - 详细模式
# 详细模式(可以添加注释)
pattern = re.compile(r"""
\d{4} # 年份
- # 分隔符
\d{2} # 月份
- # 分隔符
\d{2} # 日期
""", re.VERBOSE)
11.6 logging 模块
11.6.1 基础配置
import logging
# 基础配置
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
# 获取 logger
logger = logging.getLogger(__name__)
# 使用
logger.debug("调试信息")
logger.info("普通信息")
logger.warning("警告信息")
logger.error("错误信息")
logger.critical("严重错误")
11.6.2 详细配置
import logging
from logging.handlers import RotatingFileHandler
# 创建 logger
logger = logging.getLogger("my_app")
logger.setLevel(logging.DEBUG)
# 控制台处理器
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_formatter = logging.Formatter(
"%(levelname)s - %(message)s"
)
console_handler.setFormatter(console_formatter)
# 文件处理器(带轮转)
file_handler = RotatingFileHandler(
"app.log",
maxBytes=1024 * 1024, # 1MB
backupCount=5,
encoding="utf-8"
)
file_handler.setLevel(logging.DEBUG)
file_formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
file_handler.setFormatter(file_formatter)
# 添加处理器
logger.addHandler(console_handler)
logger.addHandler(file_handler)
# 使用
logger.info("信息")
logger.error("错误")
综合示例
示例 1:文本统计工具
from collections import Counter
import re
def text_statistics(text: str) -> dict:
    """Collect basic statistics about a piece of text.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the keys "字符数" (number of characters), "单词数"
        (number of words), "词数(去重)" (number of distinct words),
        "最常用词" (up to five ``(word, count)`` pairs, most frequent first),
        "行数" (number of lines) and "句子数" (number of sentences).
    """
    char_count = len(text)

    # Lower-case first so that "Python" and "python" count as the same word.
    words = re.findall(r"\w+", text.lower())
    word_freq = Counter(words)

    # splitlines() does not report a phantom extra line after a trailing
    # newline, unlike split("\n").
    line_count = len(text.splitlines())

    # re.split() leaves an empty or whitespace-only piece after the final
    # terminator; only pieces that contain text count as sentences.
    sentences = [piece for piece in re.split(r"[.!?]+", text) if piece.strip()]

    return {
        "字符数": char_count,
        "单词数": len(words),
        "词数(去重)": len(word_freq),
        "最常用词": word_freq.most_common(5),
        "行数": line_count,
        "句子数": len(sentences),
    }
# 测试
text = """
Python is a high-level programming language.
Python is easy to learn and powerful.
Many developers love Python.
"""
result = text_statistics(text)
for key, value in result.items():
print(f"{key}: {value}")
示例 2:日期时间工具
from datetime import datetime, timedelta
def date_range(start: datetime, end: datetime) -> list[datetime]:
    """Return the datetimes from ``start`` to ``end`` in one-day steps.

    Args:
        start: First value of the range.
        end: Upper bound, included when it is a whole number of days
            after ``start``.

    Returns:
        ``[start, start + 1 day, ...]`` up to the last value that is not
        later than ``end``; empty when ``end`` is earlier than ``start``.
    """
    # timedelta.days is floored, so the count is <= 0 whenever end < start.
    day_count = (end - start).days + 1
    return [start + timedelta(days=offset) for offset in range(day_count)]
def format_duration(seconds: float) -> str:
    """Render a duration in the largest unit that fits.

    Args:
        seconds: Duration in seconds.

    Returns:
        Whole seconds below one minute; otherwise minutes, hours or days
        with one decimal place, each followed by its Chinese unit label.
    """
    if seconds < 60:
        return f"{seconds:.0f}秒"
    if seconds < 3600:
        return f"{seconds / 60:.1f}分钟"
    if seconds < 86400:
        return f"{seconds / 3600:.1f}小时"
    return f"{seconds / 86400:.1f}天"
# 测试
start = datetime(2024, 1, 1)
end = datetime(2024, 1, 10)
dates = date_range(start, end)
print(f"日期范围: {len(dates)} 天")
print(format_duration(45)) # 45秒
print(format_duration(125)) # 2.1分钟
print(format_duration(3700)) # 1.0小时
print(format_duration(90000)) # 1.0天
最佳实践
使用 Counter 统计:高效简洁
使用 defaultdict 处理缺失键:避免 KeyError
使用 deque 实现队列和滑动窗口:O(1) 操作
使用 functools.lru_cache 缓存函数结果:提高性能
使用 itertools 生成组合:节省内存
使用正则表达式处理复杂文本:灵活强大
课后练习
练习 11.1:Counter 使用
使用 Counter 统计文本中每个字符的出现次数。
练习 11.2:日期计算
实现一个函数,计算两个日期之间的工作日数。
练习 11.3:lru_cache
使用 @lru_cache 实现斐波那契数列函数。
练习 11.4:正则表达式
使用正则表达式验证邮箱格式、手机号。
练习 11.5:itertools
使用 itertools 生成所有可能的密码组合(字母+数字)。
本章小结
本章我们详细学习了 Python 常用标准库:
collections 模块:
- Counter 计数器
- defaultdict 默认字典
- deque 双端队列
- namedtuple 命名元组
datetime 模块:
- datetime、date、time
- timedelta 时间增量
- 格式化和解析
functools 模块:
- lru_cache 缓存
- partial 偏函数
- reduce 累积
itertools 模块:
- 无限迭代器
- 组合迭代器
re 模块:
- 正则表达式匹配
- 分组和替换
logging 模块:
- 日志配置
这些标准库是 Python 编程的基础,熟练使用可以大大提高开发效率。