引言
Python作为一门高级编程语言,内置了多种强大的数据结构,包括列表、字典、集合和元组等。这些数据结构是Python编程的基础,掌握它们的高级用法和内部实现原理,对于编写高效、优雅的代码至关重要。本文将深入探讨Python中的高级数据结构,包括它们的内部实现、性能特性以及在实际项目中的应用。
列表(List)的高级应用
列表推导式与生成器表达式
列表推导式是Python中一种简洁且高效的创建列表的方法。它不仅可以替代传统的for循环,还能在某些场景下提供更好的性能。
# 传统方式创建列表
squares = []
for x in range(10):
squares.append(x**2)
# 使用列表推导式
squares = [x**2 for x in range(10)]
print(squares) # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
# 带条件的列表推导式
even_squares = [x**2 for x in range(10) if x % 2 == 0]
print(even_squares) # [0, 4, 16, 36, 64]
# 嵌套列表推导式
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flattened = [num for row in matrix for num in row]
print(flattened) # [1, 2, 3, 4, 5, 6, 7, 8, 9]
生成器表达式与列表推导式类似,但它返回一个生成器对象,可以节省内存,特别适合处理大数据集。
# 生成器表达式
squares_gen = (x**2 for x in range(10))
print(squares_gen) # <generator object <genexpr> at 0x...>
# 逐个获取值
for num in squares_gen:
print(num, end=' ')
# 0 1 4 9 16 25 36 49 64 81
列表的切片操作
切片是Python中非常强大的功能,可以用于复制列表、反转列表、获取子列表等。
my_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# 基本切片
print(my_list[2:5]) # [2, 3, 4]
print(my_list[:3]) # [0, 1, 2]
print(my_list[6:]) # [6, 7, 8, 9]
print(my_list[::2]) # [0, 2, 4, 6, 8] 步长为2
# 反转列表
reversed_list = my_list[::-1]
print(reversed_list) # [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
# 复制列表
copied_list = my_list[:]
print(copied_list) # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
列表的性能特性
列表在Python中是动态数组实现的,这意味着:
- 随机访问(通过索引)的时间复杂度是O(1)
- 在末尾添加元素(append)的平均时间复杂度是O(1)
- 在中间或开头插入元素(insert)的时间复杂度是O(n)
- 删除元素(remove, pop(i))的时间复杂度是O(n)
import time
# 比较append和insert的性能
def test_append(n):
lst = []
for i in range(n):
lst.append(i)
return lst
def test_insert(n):
lst = []
for i in range(n):
lst.insert(0, i) # 每次都在开头插入
return lst
n = 10000
start = time.time()
test_append(n)
print(f"Append took: {time.time() - start:.6f} seconds")
start = time.time()
test_insert(n)
print(f"Insert took: {time.time() - start:.6f} seconds")
字典(Dictionary)的高级应用
字典推导式
与列表推导式类似,字典推导式可以简洁地创建字典。
# 基本字典推导式
squares_dict = {x: x**2 for x in range(6)}
print(squares_dict) # {0: 0, 1: 1, 2: 4, 3: 9, 4: 16, 5: 25}
# 交换键值
original = {'a': 1, 'b': 2, 'c': 3}
swapped = {v: k for k, v in original.items()}
print(swapped) # {1: 'a', 2: 'b', 3: 'c'}
# 过滤字典
prices = {'apple': 0.5, 'banana': 0.25, 'orange': 0.75, 'milk': 1.5}
expensive = {k: v for k, v in prices.items() if v >= 0.75}
print(expensive) # {'orange': 0.75, 'milk': 1.5}
defaultdict和OrderedDict
collections模块提供了字典的增强版本。
from collections import defaultdict, OrderedDict
# defaultdict: 当键不存在时返回默认值
dd = defaultdict(int) # 默认值为0
dd['count'] += 1
print(dd) # defaultdict(<class 'int'>, {'count': 1})
# 按字母统计单词出现次数
text = "apple banana apple cherry banana apple"
word_counts = defaultdict(int)
for word in text.split():
word_counts[word] += 1
print(dict(word_counts)) # {'apple': 3, 'banana': 2, 'cherry': 1}
# OrderedDict: 保持插入顺序
od = OrderedDict()
od['first'] = 1
od['second'] = 2
od['third'] = 3
print(od) # OrderedDict([('first', 1), ('second', 2), ('third', 3)])
# Python 3.7+ 普通字典也保持插入顺序
regular_dict = {'first': 1, 'second': 2, 'third': 3}
print(regular_dict) # {'first': 1, 'second': 2, 'third': 3}
字典的性能特性
字典在Python中是通过哈希表实现的:
- 查找、插入、删除的平均时间复杂度是O(1)
- 最坏情况是O(n),但很少发生
- 内存占用相对较大
import time
import random
# 测试字典查找性能
def test_dict_lookup(n):
d = {i: i**2 for i in range(n)}
start = time.time()
for _ in range(10000):
key = random.randint(0, n-1)
_ = d[key]
return time.time() - start
# 测试列表查找性能
def test_list_lookup(n):
lst = list(range(n))
start = time.time()
for _ in range(10000):
key = random.randint(0, n-1)
try:
_ = lst.index(key)
except ValueError:
pass
return time.time() - start
n = 10000
print(f"Dict lookup time: {test_dict_lookup(n):.6f}")
print(f"List lookup time: {test_list_lookup(n):.6f}")
集合(Set)的高级应用
集合运算
集合支持数学上的集合运算,非常适合用于数据去重和关系运算。
# 集合的基本操作
A = {1, 2, 3, 4, 5}
B = {4, 5, 6, 7, 8}
# 并集
print(A | B) # {1, 2, 3, 4, 5, 6, 7, 8}
print(A.union(B)) # 同上
# 交集
print(A & B) # {4, 5}
print(A.intersection(B)) # 同上
# 差集
print(A - B) # {1, 2, 3}
print(A.difference(B)) # 同上
# 对称差集(只属于其中一个集合的元素)
print(A ^ B) # {1, 2, 3, 6, 7, 8}
print(A.symmetric_difference(B)) # 同上
# 判断子集和超集
C = {2, 3}
print(C <= A) # True, C是A的子集
print(A >= C) # True, A是C的超集
集合推导式
# 创建数字的平方集合
squares_set = {x**2 for x in range(-5, 6)}
print(squares_set) # {0, 1, 4, 9, 16, 25}
# 从列表中提取唯一元素
numbers = [1, 2, 2, 3, 4, 4, 5, 1, 2]
unique_numbers = set(numbers)
print(unique_numbers) # {1, 2, 3, 4, 5}
# 集合推导式与条件
even_squares = {x**2 for x in range(10) if x % 2 == 0}
print(even_squares) # {0, 4, 16, 36, 64}
集合的性能特性
集合与字典类似,也是基于哈希表实现:
- 添加、删除、查找的平均时间复杂度是O(1)
- 集合运算(如并集、交集)的时间复杂度取决于集合大小
import time
def test_set_operations(n):
# 创建两个大集合
set1 = set(range(0, n, 2)) # 偶数
set2 = set(range(1, n, 2)) # 奇数
# 测试并集性能
start = time.time()
union = set1 | set2
union_time = time.time() - start
# 测试交集性能
start = time.time()
intersection = set1 & set2
intersection_time = time.time() - start
return union_time, intersection_time
n = 100000
union_t, inter_t = test_set_operations(n)
print(f"Union of {n} elements took: {union_t:.6f} seconds")
print(f"Intersection of {n} elements took: {inter_t:.6f} seconds")
元组(Tuple)的高级应用
元组拆包和多变量赋值
元组在Python中常用于函数返回多个值和多变量赋值。
# 基本拆包
point = (3, 4)
x, y = point
print(f"x={x}, y={y}") # x=3, y=4
# 交换变量值
a, b = 1, 2
a, b = b, a
print(f"a={a}, b={b}") # a=2, b=1
# 函数返回多个值
def get_user_info():
return "Alice", 25, "alice@example.com"
name, age, email = get_user_info()
print(f"Name: {name}, Age: {age}, Email: {email}")
# 带星号的拆包
first, *middle, last = [1, 2, 3, 4, 5]
print(first) # 1
print(middle) # [2, 3, 4]
print(last) # 5
命名元组(namedtuple)
collections.namedtuple为元组元素提供名称,使代码更具可读性。
from collections import namedtuple
# 定义命名元组
Point = namedtuple('Point', ['x', 'y'])
p = Point(3, 4)
print(p.x, p.y) # 3 4
# 与普通元组比较
regular_tuple = (3, 4)
print(regular_tuple[0]) # 3
print(p[0]) # 3
print(p.x) # 3 (更具可读性)
# 在实际项目中的应用
Employee = namedtuple('Employee', ['id', 'name', 'department', 'salary'])
employees = [
Employee(1, 'Alice', 'Engineering', 90000),
Employee(2, 'Bob', 'Marketing', 75000),
Employee(3, 'Charlie', 'Engineering', 95000)
]
# 计算平均工资
avg_salary = sum(e.salary for e in employees) / len(employees)
print(f"Average salary: ${avg_salary:.2f}")
元组的性能特性
元组是不可变的,因此:
- 创建速度比列表快
- 可以作为字典的键(因为不可变)
- 内存占用比列表小
import time
import sys
# 比较创建速度
def test_tuple_creation(n):
start = time.time()
for i in range(n):
t = (i, i+1, i+2)
return time.time() - start
def test_list_creation(n):
start = time.time()
for i in range(n):
lst = [i, i+1, i+2]
return time.time() - start
n = 1000000
tuple_time = test_tuple_creation(n)
list_time = test_list_creation(n)
print(f"Tuple creation: {tuple_time:.6f} seconds")
print(f"List creation: {list_time:.6f} seconds")
# 比较内存占用
t = (1, 2, 3, 4, 5)
lst = [1, 2, 3, 4, 5]
print(f"Tuple size: {sys.getsizeof(t)} bytes")
print(f"List size: {sys.getsizeof(lst)} bytes")
实际项目中的应用案例
案例1:数据处理管道
from collections import defaultdict, namedtuple
from typing import List, Dict
# 定义数据结构
Record = namedtuple('Record', ['timestamp', 'user_id', 'action', 'value'])
# 模拟日志数据
logs = [
Record('2024-01-01 10:00', 'user1', 'login', 1),
Record('2024-01-01 10:05', 'user1', 'purchase', 50),
Record('2024-01-01 10:10', 'user2', 'login', 1),
Record('2024-01-01 10:15', 'user1', 'purchase', 30),
Record('2024-01-01 10:20', 'user2', 'purchase', 20),
]
def process_logs(logs: List[Record]) -> Dict:
"""处理日志数据,统计用户行为"""
# 使用defaultdict统计每个用户的总消费
user_spending = defaultdict(float)
# 使用集合记录活跃用户
active_users = set()
for log in logs:
active_users.add(log.user_id)
if log.action == 'purchase':
user_spending[log.user_id] += log.value
# 使用字典推导式计算平均消费
avg_spending = {
user: total / logs.count(user)
for user, total in user_spending.items()
}
return {
'total_users': len(active_users),
'total_revenue': sum(user_spending.values()),
'user_spending': dict(user_spending),
'avg_spending': avg_spending
}
result = process_logs(logs)
print("Processing Results:")
for key, value in result.items():
print(f" {key}: {value}")
案例2:图算法中的应用
from collections import defaultdict, deque
class Graph:
"""使用字典和集合实现图数据结构"""
def __init__(self):
self.adjacency = defaultdict(set) # 邻接表
def add_edge(self, u, v):
"""添加边"""
self.adjacency[u].add(v)
self.adjacency[v].add(u) # 无向图
def bfs(self, start):
"""广度优先搜索"""
visited = set()
queue = deque([start])
visited.add(start)
result = []
while queue:
node = queue.popleft()
result.append(node)
for neighbor in self.adjacency[node]:
if neighbor not in visited:
visited.add(neighbor)
queue.append(neighbor)
return result
def find_all_paths(self, start, end, path=None):
"""找到所有路径(DFS)"""
if path is None:
path = [start]
if start == end:
return [path]
paths = []
for node in self.adjacency[start] - set(path):
new_paths = self.find_all_paths(node, end, path + [node])
paths.extend(new_paths)
return paths
# 使用示例
g = Graph()
g.add_edge('A', 'B')
g.add_edge('A', 'C')
g.add_edge('B', 'D')
g.add_edge('C', 'D')
g.add_edge('D', 'E')
print("BFS from A:", g.bfs('A'))
print("All paths from A to E:", g.find_all_paths('A', 'E'))
案例3:缓存系统实现
from functools import wraps
from collections import OrderedDict
import time
class LRUCache:
"""使用OrderedDict实现LRU缓存"""
def __init__(self, capacity: int):
self.cache = OrderedDict()
self.capacity = capacity
def get(self, key: str) -> int:
if key not in self.cache:
return -1
# 将访问的元素移到末尾(最近使用)
self.cache.move_to_end(key)
return self.cache[key]
def put(self, key: str, value: int) -> None:
if key in self.cache:
self.cache.move_to_end(key)
self.cache[key] = value
if len(self.cache) > self.capacity:
# 弹出最久未使用的元素(第一个)
self.cache.popitem(last=False)
# 装饰器版本
def lru_cache(maxsize=128):
def decorator(func):
cache = OrderedDict()
@wraps(func)
def wrapper(*args, **kwargs):
key = str(args) + str(kwargs)
if key in cache:
cache.move_to_end(key)
return cache[key]
result = func(*args, **kwargs)
if len(cache) >= maxsize:
cache.popitem(last=False)
cache[key] = result
return result
return wrapper
return decorator
# 使用示例
@lru_cache(maxsize=3)
def expensive_computation(x):
time.sleep(0.1) # 模拟耗时计算
return x * x
print(expensive_computation(2)) # 计算并缓存
print(expensive_computation(2)) # 直接从缓存返回
print(expensive_computation(3)) # 计算并缓存
print(expensive_computation(4)) # 计算并缓存,缓存已满,淘汰最旧的
性能优化技巧
1. 选择合适的数据结构
# 检查元素是否存在:使用集合而不是列表
def check_performance():
large_list = list(range(100000))
large_set = set(large_list)
import time
# 列表查找(线性时间)
start = time.time()
_ = 99999 in large_list
list_time = time.time() - start
# 集合查找(常数时间)
start = time.time()
_ = 99999 in large_set
set_time = time.time() - start
print(f"List lookup: {list_time:.6f} seconds")
print(f"Set lookup: {set_time:.6f} seconds")
print(f"Speedup: {list_time/set_time:.2f}x")
check_performance()
2. 使用生成器处理大数据
# 传统方式:一次性加载所有数据到内存
def process_large_file_traditional(filename):
with open(filename) as f:
lines = f.readlines() # 所有行加载到内存
return [line.strip() for line in lines if len(line) > 10]
# 生成器方式:逐行处理
def process_large_file_generator(filename):
with open(filename) as f:
for line in f:
if len(line) > 10:
yield line.strip()
# 使用示例
# for processed_line in process_large_file_generator("large_file.txt"):
# print(processed_line)
3. 使用字典替代多个if-else
# 传统方式
def handle_action_traditional(action):
if action == 'create':
return "Creating..."
elif action == 'update':
return "Updating..."
elif action == 'delete':
return "Deleting..."
else:
return "Unknown action"
# 字典映射方式
def handle_action_dict(action):
actions = {
'create': lambda: "Creating...",
'update': lambda: "Updating...",
'delete': lambda: "Deleting..."
}
return actions.get(action, lambda: "Unknown action")()
# 更高级的字典映射(带参数)
def create_item(name):
return f"Creating {name}"
def update_item(name):
return f"Updating {name}"
def delete_item(name):
return f"Deleting {name}"
action_handlers = {
'create': create_item,
'update': update_item,
'delete': delete_item
}
def handle_item(action, name):
return action_handlers.get(action, lambda x: "Unknown action")(name)
print(handle_item('create', 'book')) # Creating book
总结
Python的高级数据结构是编写高效、优雅代码的基础。通过深入理解它们的内部实现和性能特性,我们可以在实际项目中做出更明智的选择:
- 列表:适合有序数据,动态数组实现,随机访问快,中间插入慢
- 字典:键值对存储,哈希表实现,查找速度快,适合快速检索
- 集合:唯一元素存储,哈希表实现,适合去重和集合运算
- 元组:不可变序列,创建快,内存小,适合作为字典键和函数返回值
在实际开发中,结合列表推导式、字典推导式、生成器表达式等高级特性,可以写出更加简洁、高效的代码。同时,根据具体场景选择合适的数据结构,是每个Python开发者应该掌握的重要技能。# Python中的高级数据结构:从基础到实战应用
引言
Python作为一门高级编程语言,内置了多种强大的数据结构,包括列表、字典、集合和元组等。这些数据结构是Python编程的基础,掌握它们的高级用法和内部实现原理,对于编写高效、优雅的代码至关重要。本文将深入探讨Python中的高级数据结构,包括它们的内部实现、性能特性以及在实际项目中的应用。
列表(List)的高级应用
列表推导式与生成器表达式
列表推导式是Python中一种简洁且高效的创建列表的方法。它不仅可以替代传统的for循环,还能在某些场景下提供更好的性能。
# 传统方式创建列表
squares = []
for x in range(10):
squares.append(x**2)
# 使用列表推导式
squares = [x**2 for x in range(10)]
print(squares) # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
# 带条件的列表推导式
even_squares = [x**2 for x in range(10) if x % 2 == 0]
print(even_squares) # [0, 4, 16, 36, 64]
# 嵌套列表推导式
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flattened = [num for row in matrix for num in row]
print(flattened) # [1, 2, 3, 4, 5, 6, 7, 8, 9]
生成器表达式与列表推导式类似,但它返回一个生成器对象,可以节省内存,特别适合处理大数据集。
# 生成器表达式
squares_gen = (x**2 for x in range(10))
print(squares_gen) # <generator object <genexpr> at 0x...>
# 逐个获取值
for num in squares_gen:
print(num, end=' ')
# 0 1 4 9 16 25 36 49 64 81
列表的切片操作
切片是Python中非常强大的功能,可以用于复制列表、反转列表、获取子列表等。
my_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# 基本切片
print(my_list[2:5]) # [2, 3, 4]
print(my_list[:3]) # [0, 1, 2]
print(my_list[6:]) # [6, 7, 8, 9]
print(my_list[::2]) # [0, 2, 4, 6, 8] 步长为2
# 反转列表
reversed_list = my_list[::-1]
print(reversed_list) # [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
# 复制列表
copied_list = my_list[:]
print(copied_list) # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
列表的性能特性
列表在Python中是动态数组实现的,这意味着:
- 随机访问(通过索引)的时间复杂度是O(1)
- 在末尾添加元素(append)的平均时间复杂度是O(1)
- 在中间或开头插入元素(insert)的时间复杂度是O(n)
- 删除元素(remove, pop(i))的时间复杂度是O(n)
import time
# 比较append和insert的性能
def test_append(n):
lst = []
for i in range(n):
lst.append(i)
return lst
def test_insert(n):
lst = []
for i in range(n):
lst.insert(0, i) # 每次都在开头插入
return lst
n = 10000
start = time.time()
test_append(n)
print(f"Append took: {time.time() - start:.6f} seconds")
start = time.time()
test_insert(n)
print(f"Insert took: {time.time() - start:.6f} seconds")
字典(Dictionary)的高级应用
字典推导式
与列表推导式类似,字典推导式可以简洁地创建字典。
# 基本字典推导式
squares_dict = {x: x**2 for x in range(6)}
print(squares_dict) # {0: 0, 1: 1, 2: 4, 3: 9, 4: 16, 5: 25}
# 交换键值
original = {'a': 1, 'b': 2, 'c': 3}
swapped = {v: k for k, v in original.items()}
print(swapped) # {1: 'a', 2: 'b', 3: 'c'}
# 过滤字典
prices = {'apple': 0.5, 'banana': 0.25, 'orange': 0.75, 'milk': 1.5}
expensive = {k: v for k, v in prices.items() if v >= 0.75}
print(expensive) # {'orange': 0.75, 'milk': 1.5}
defaultdict和OrderedDict
collections模块提供了字典的增强版本。
from collections import defaultdict, OrderedDict
# defaultdict: 当键不存在时返回默认值
dd = defaultdict(int) # 默认值为0
dd['count'] += 1
print(dd) # defaultdict(<class 'int'>, {'count': 1})
# 按字母统计单词出现次数
text = "apple banana apple cherry banana apple"
word_counts = defaultdict(int)
for word in text.split():
word_counts[word] += 1
print(dict(word_counts)) # {'apple': 3, 'banana': 2, 'cherry': 1}
# OrderedDict: 保持插入顺序
od = OrderedDict()
od['first'] = 1
od['second'] = 2
od['third'] = 3
print(od) # OrderedDict([('first', 1), ('second', 2), ('third', 3)])
# Python 3.7+ 普通字典也保持插入顺序
regular_dict = {'first': 1, 'second': 2, 'third': 3}
print(regular_dict) # {'first': 1, 'second': 2, 'third': 3}
字典的性能特性
字典在Python中是通过哈希表实现的:
- 查找、插入、删除的平均时间复杂度是O(1)
- 最坏情况是O(n),但很少发生
- 内存占用相对较大
import time
import random
# 测试字典查找性能
def test_dict_lookup(n):
d = {i: i**2 for i in range(n)}
start = time.time()
for _ in range(10000):
key = random.randint(0, n-1)
_ = d[key]
return time.time() - start
# 测试列表查找性能
def test_list_lookup(n):
lst = list(range(n))
start = time.time()
for _ in range(10000):
key = random.randint(0, n-1)
try:
_ = lst.index(key)
except ValueError:
pass
return time.time() - start
n = 10000
print(f"Dict lookup time: {test_dict_lookup(n):.6f}")
print(f"List lookup time: {test_list_lookup(n):.6f}")
集合(Set)的高级应用
集合运算
集合支持数学上的集合运算,非常适合用于数据去重和关系运算。
# 集合的基本操作
A = {1, 2, 3, 4, 5}
B = {4, 5, 6, 7, 8}
# 并集
print(A | B) # {1, 2, 3, 4, 5, 6, 7, 8}
print(A.union(B)) # 同上
# 交集
print(A & B) # {4, 5}
print(A.intersection(B)) # 同上
# 差集
print(A - B) # {1, 2, 3}
print(A.difference(B)) # 同上
# 对称差集(只属于其中一个集合的元素)
print(A ^ B) # {1, 2, 3, 6, 7, 8}
print(A.symmetric_difference(B)) # 同上
# 判断子集和超集
C = {2, 3}
print(C <= A) # True, C是A的子集
print(A >= C) # True, A是C的超集
集合推导式
# 创建数字的平方集合
squares_set = {x**2 for x in range(-5, 6)}
print(squares_set) # {0, 1, 4, 9, 16, 25}
# 从列表中提取唯一元素
numbers = [1, 2, 2, 3, 4, 4, 5, 1, 2]
unique_numbers = set(numbers)
print(unique_numbers) # {1, 2, 3, 4, 5}
# 集合推导式与条件
even_squares = {x**2 for x in range(10) if x % 2 == 0}
print(even_squares) # {0, 4, 16, 36, 64}
集合的性能特性
集合与字典类似,也是基于哈希表实现:
- 添加、删除、查找的平均时间复杂度是O(1)
- 集合运算(如并集、交集)的时间复杂度取决于集合大小
import time
def test_set_operations(n):
# 创建两个大集合
set1 = set(range(0, n, 2)) # 偶数
set2 = set(range(1, n, 2)) # 奇数
# 测试并集性能
start = time.time()
union = set1 | set2
union_time = time.time() - start
# 测试交集性能
start = time.time()
intersection = set1 & set2
intersection_time = time.time() - start
return union_time, intersection_time
n = 100000
union_t, inter_t = test_set_operations(n)
print(f"Union of {n} elements took: {union_t:.6f} seconds")
print(f"Intersection of {n} elements took: {inter_t:.6f} seconds")
元组(Tuple)的高级应用
元组拆包和多变量赋值
元组在Python中常用于函数返回多个值和多变量赋值。
# 基本拆包
point = (3, 4)
x, y = point
print(f"x={x}, y={y}") # x=3, y=4
# 交换变量值
a, b = 1, 2
a, b = b, a
print(f"a={a}, b={b}") # a=2, b=1
# 函数返回多个值
def get_user_info():
return "Alice", 25, "alice@example.com"
name, age, email = get_user_info()
print(f"Name: {name}, Age: {age}, Email: {email}")
# 带星号的拆包
first, *middle, last = [1, 2, 3, 4, 5]
print(first) # 1
print(middle) # [2, 3, 4]
print(last) # 5
命名元组(namedtuple)
collections.namedtuple为元组元素提供名称,使代码更具可读性。
from collections import namedtuple
# 定义命名元组
Point = namedtuple('Point', ['x', 'y'])
p = Point(3, 4)
print(p.x, p.y) # 3 4
# 与普通元组比较
regular_tuple = (3, 4)
print(regular_tuple[0]) # 3
print(p[0]) # 3
print(p.x) # 3 (更具可读性)
# 在实际项目中的应用
Employee = namedtuple('Employee', ['id', 'name', 'department', 'salary'])
employees = [
Employee(1, 'Alice', 'Engineering', 90000),
Employee(2, 'Bob', 'Marketing', 75000),
Employee(3, 'Charlie', 'Engineering', 95000)
]
# 计算平均工资
avg_salary = sum(e.salary for e in employees) / len(employees)
print(f"Average salary: ${avg_salary:.2f}")
元组的性能特性
元组是不可变的,因此:
- 创建速度比列表快
- 可以作为字典的键(因为不可变)
- 内存占用比列表小
import time
import sys
# 比较创建速度
def test_tuple_creation(n):
start = time.time()
for i in range(n):
t = (i, i+1, i+2)
return time.time() - start
def test_list_creation(n):
start = time.time()
for i in range(n):
lst = [i, i+1, i+2]
return time.time() - start
n = 1000000
tuple_time = test_tuple_creation(n)
list_time = test_list_creation(n)
print(f"Tuple creation: {tuple_time:.6f} seconds")
print(f"List creation: {list_time:.6f} seconds")
# 比较内存占用
t = (1, 2, 3, 4, 5)
lst = [1, 2, 3, 4, 5]
print(f"Tuple size: {sys.getsizeof(t)} bytes")
print(f"List size: {sys.getsizeof(lst)} bytes")
实际项目中的应用案例
案例1:数据处理管道
from collections import defaultdict, namedtuple
from typing import List, Dict
# 定义数据结构
Record = namedtuple('Record', ['timestamp', 'user_id', 'action', 'value'])
# 模拟日志数据
logs = [
Record('2024-01-01 10:00', 'user1', 'login', 1),
Record('2024-01-01 10:05', 'user1', 'purchase', 50),
Record('2024-01-01 10:10', 'user2', 'login', 1),
Record('2024-01-01 10:15', 'user1', 'purchase', 30),
Record('2024-01-01 10:20', 'user2', 'purchase', 20),
]
def process_logs(logs: List[Record]) -> Dict:
"""处理日志数据,统计用户行为"""
# 使用defaultdict统计每个用户的总消费
user_spending = defaultdict(float)
# 使用集合记录活跃用户
active_users = set()
for log in logs:
active_users.add(log.user_id)
if log.action == 'purchase':
user_spending[log.user_id] += log.value
# 使用字典推导式计算平均消费
avg_spending = {
user: total / logs.count(user)
for user, total in user_spending.items()
}
return {
'total_users': len(active_users),
'total_revenue': sum(user_spending.values()),
'user_spending': dict(user_spending),
'avg_spending': avg_spending
}
result = process_logs(logs)
print("Processing Results:")
for key, value in result.items():
print(f" {key}: {value}")
案例2:图算法中的应用
from collections import defaultdict, deque
class Graph:
"""使用字典和集合实现图数据结构"""
def __init__(self):
self.adjacency = defaultdict(set) # 邻接表
def add_edge(self, u, v):
"""添加边"""
self.adjacency[u].add(v)
self.adjacency[v].add(u) # 无向图
def bfs(self, start):
"""广度优先搜索"""
visited = set()
queue = deque([start])
visited.add(start)
result = []
while queue:
node = queue.popleft()
result.append(node)
for neighbor in self.adjacency[node]:
if neighbor not in visited:
visited.add(neighbor)
queue.append(neighbor)
return result
def find_all_paths(self, start, end, path=None):
"""找到所有路径(DFS)"""
if path is None:
path = [start]
if start == end:
return [path]
paths = []
for node in self.adjacency[start] - set(path):
new_paths = self.find_all_paths(node, end, path + [node])
paths.extend(new_paths)
return paths
# 使用示例
g = Graph()
g.add_edge('A', 'B')
g.add_edge('A', 'C')
g.add_edge('B', 'D')
g.add_edge('C', 'D')
g.add_edge('D', 'E')
print("BFS from A:", g.bfs('A'))
print("All paths from A to E:", g.find_all_paths('A', 'E'))
案例3:缓存系统实现
from functools import wraps
from collections import OrderedDict
import time
class LRUCache:
"""使用OrderedDict实现LRU缓存"""
def __init__(self, capacity: int):
self.cache = OrderedDict()
self.capacity = capacity
def get(self, key: str) -> int:
if key not in self.cache:
return -1
# 将访问的元素移到末尾(最近使用)
self.cache.move_to_end(key)
return self.cache[key]
def put(self, key: str, value: int) -> None:
if key in self.cache:
self.cache.move_to_end(key)
self.cache[key] = value
if len(self.cache) > self.capacity:
# 弹出最久未使用的元素(第一个)
self.cache.popitem(last=False)
# 装饰器版本
def lru_cache(maxsize=128):
def decorator(func):
cache = OrderedDict()
@wraps(func)
def wrapper(*args, **kwargs):
key = str(args) + str(kwargs)
if key in cache:
cache.move_to_end(key)
return cache[key]
result = func(*args, **kwargs)
if len(cache) >= maxsize:
cache.popitem(last=False)
cache[key] = result
return result
return wrapper
return decorator
# 使用示例
@lru_cache(maxsize=3)
def expensive_computation(x):
time.sleep(0.1) # 模拟耗时计算
return x * x
print(expensive_computation(2)) # 计算并缓存
print(expensive_computation(2)) # 直接从缓存返回
print(expensive_computation(3)) # 计算并缓存
print(expensive_computation(4)) # 计算并缓存,缓存已满,淘汰最旧的
性能优化技巧
1. 选择合适的数据结构
# 检查元素是否存在:使用集合而不是列表
def check_performance():
large_list = list(range(100000))
large_set = set(large_list)
import time
# 列表查找(线性时间)
start = time.time()
_ = 99999 in large_list
list_time = time.time() - start
# 集合查找(常数时间)
start = time.time()
_ = 99999 in large_set
set_time = time.time() - start
print(f"List lookup: {list_time:.6f} seconds")
print(f"Set lookup: {set_time:.6f} seconds")
print(f"Speedup: {list_time/set_time:.2f}x")
check_performance()
2. 使用生成器处理大数据
# 传统方式:一次性加载所有数据到内存
def process_large_file_traditional(filename):
with open(filename) as f:
lines = f.readlines() # 所有行加载到内存
return [line.strip() for line in lines if len(line) > 10]
# 生成器方式:逐行处理
def process_large_file_generator(filename):
with open(filename) as f:
for line in f:
if len(line) > 10:
yield line.strip()
# 使用示例
# for processed_line in process_large_file_generator("large_file.txt"):
# print(processed_line)
3. 使用字典替代多个if-else
# 传统方式
def handle_action_traditional(action):
if action == 'create':
return "Creating..."
elif action == 'update':
return "Updating..."
elif action == 'delete':
return "Deleting..."
else:
return "Unknown action"
# 字典映射方式
def handle_action_dict(action):
actions = {
'create': lambda: "Creating...",
'update': lambda: "Updating...",
'delete': lambda: "Deleting..."
}
return actions.get(action, lambda: "Unknown action")()
# 更高级的字典映射(带参数)
def create_item(name):
return f"Creating {name}"
def update_item(name):
return f"Updating {name}"
def delete_item(name):
return f"Deleting {name}"
action_handlers = {
'create': create_item,
'update': update_item,
'delete': delete_item
}
def handle_item(action, name):
return action_handlers.get(action, lambda x: "Unknown action")(name)
print(handle_item('create', 'book')) # Creating book
总结
Python的高级数据结构是编写高效、优雅代码的基础。通过深入理解它们的内部实现和性能特性,我们可以在实际项目中做出更明智的选择:
- 列表:适合有序数据,动态数组实现,随机访问快,中间插入慢
- 字典:键值对存储,哈希表实现,查找速度快,适合快速检索
- 集合:唯一元素存储,哈希表实现,适合去重和集合运算
- 元组:不可变序列,创建快,内存小,适合作为字典键和函数返回值
在实际开发中,结合列表推导式、字典推导式、生成器表达式等高级特性,可以写出更加简洁、高效的代码。同时,根据具体场景选择合适的数据结构,是每个Python开发者应该掌握的重要技能。
