案例:PyO3 文字解析
案例:PyO3 文字解析
本案例基於 .claude/lib/markdown_link_checker.py 的實際程式碼,展示如何用 PyO3 和 Rust 實現高效能的文字解析器。
先備知識
- 模組五:用 Rust 擴展 Python
- Rust 基礎語法
- 5.1 Cython 加速
問題背景
現有設計
markdown_link_checker.py 使用純 Python 的正則表達式解析 Markdown 連結:
1import re
2from typing import List, Dict
3
4class MarkdownLinkChecker:
5 """Markdown 連結檢查器"""
6
7 # Markdown 連結正則表達式
8 # 匹配 [text](target) 格式,排除圖片 ![alt](src)
9 INLINE_LINK_PATTERN = re.compile(
10 r'(?<!!)\[([^\]]+)\]\(([^)]+)\)'
11 )
12
13 # 引用式連結定義 [ref]: target
14 REFERENCE_DEF_PATTERN = re.compile(
15 r'^\s*\[([^\]]+)\]:\s*(.+)$',
16 re.MULTILINE
17 )
18
19 # 引用式連結使用 [text][ref]
20 REFERENCE_USE_PATTERN = re.compile(
21 r'\[([^\]]+)\]\[([^\]]+)\]'
22 )
23
24 def parse_markdown_links(self, content: str) -> List[Dict]:
25 """
26 解析 Markdown 內容中的所有連結
27
28 Args:
29 content: Markdown 內容
30
31 Returns:
32 list[dict]: 連結列表,每個包含 text, target, line
33 """
34 links = []
35 lines = content.split('\n')
36
37 # 首先收集引用式連結定義
38 reference_defs = {}
39 for match in self.REFERENCE_DEF_PATTERN.finditer(content):
40 ref_name = match.group(1).lower()
41 ref_target = match.group(2).strip()
42 reference_defs[ref_name] = ref_target
43
44 # 追蹤是否在程式碼區塊內
45 in_code_block = False
46
47 # 解析行內連結
48 for line_num, line in enumerate(lines, start=1):
49 # 檢查程式碼區塊開始/結束
50 if line.strip().startswith("```"):
51 in_code_block = not in_code_block
52 continue
53
54 # 跳過程式碼區塊內的連結
55 if in_code_block:
56 continue
57
58 # 行內連結 [text](/python-advanced/06-rust-extensions/case-studies/pyo3-parser/target)
59 for match in self.INLINE_LINK_PATTERN.finditer(line):
60 links.append({
61 "text": match.group(1),
62 "target": match.group(2),
63 "line": line_num
64 })
65
66 # 引用式連結 [text][ref]
67 for match in self.REFERENCE_USE_PATTERN.finditer(line):
68 ref_name = match.group(2).lower()
69 if ref_name in reference_defs:
70 links.append({
71 "text": match.group(1),
72 "target": reference_defs[ref_name],
73 "line": line_num
74 })
75
76 return links這段程式碼的核心瓶頸:
- 正則表達式解析:Python 的 `re` 模組效能有限
- 字串分割與迭代:大量的記憶體配置
- 字典操作:每個連結都建立新字典
為什麼選擇 Rust?
相比 Cython:
| 面向 | Cython | Rust (PyO3) |
|---|---|---|
| 記憶體安全 | 依賴 GC | 編譯時保證 |
| 正則表達式 | 仍用 Python re | 原生 regex crate |
| 錯誤處理 | 例外機制 | Result 類型 |
| 多執行緒 | 可用 `nogil` 釋放 GIL,但僅限不操作 Python 物件的程式碼 | 可用 `allow_threads` 釋放 GIL,純 Rust 邏輯可真正平行執行 |
| 生態系統 | 有限 | 豐富的 Cargo 生態 |
進階解決方案
設計目標
- 用 Rust 重寫核心解析邏輯
- 保持 Python API 相容
- 實現顯著的效能提升
實作步驟
步驟 1:建立 Rust 專案結構
首先,使用 Maturin 建立新專案:
1# 安裝 maturin(如果尚未安裝)
2pip install maturin
3
4# 建立新專案
5mkdir markdown_parser_rs
6cd markdown_parser_rs
7
8# 初始化 maturin 專案(選擇 pyo3 作為綁定)
9maturin init --bindings pyo3
10
11# 專案結構如下:
12# markdown_parser_rs/
13# ├── Cargo.toml
14# ├── pyproject.toml
15# └── src/
16# └── lib.rs接著,編輯 Cargo.toml 加入必要的依賴:
1[package]
2name = "markdown_parser_rs"
3version = "0.1.0"
4edition = "2021"
5
6[lib]
7name = "markdown_parser_rs"
8crate-type = ["cdylib"]
9
10[dependencies]
11pyo3 = { version = "0.22", features = ["extension-module"] }
12regex = "1.11"
13once_cell = "1.20"步驟 2:實作 Rust 解析函式
先定義核心的資料結構和解析邏輯:
1// src/lib.rs
2use once_cell::sync::Lazy;
3use regex::Regex;
4use std::collections::HashMap;
5
6/// Represents a parsed markdown link
7#[derive(Debug, Clone)]
8pub struct MarkdownLink {
9 pub text: String,
10 pub target: String,
11 pub line: usize,
12}
13
14// Pre-compiled regex patterns (compile once, use many times)
15static INLINE_LINK_PATTERN: Lazy<Regex> = Lazy::new(|| {
16 // Match [text](/python-advanced/06-rust-extensions/case-studies/pyo3-parser/target), excluding images 
17 Regex::new(r"(?<!!)\[([^\]]+)\]\(([^)]+)\)").unwrap()
18});
19
20static REFERENCE_DEF_PATTERN: Lazy<Regex> = Lazy::new(|| {
21 // Match [ref]: target
22 Regex::new(r"(?m)^\s*\[([^\]]+)\]:\s*(.+)$").unwrap()
23});
24
25static REFERENCE_USE_PATTERN: Lazy<Regex> = Lazy::new(|| {
26 // Match [text][ref]
27 Regex::new(r"\[([^\]]+)\]\[([^\]]+)\]").unwrap()
28});
29
30/// Parse markdown content and extract all links
31pub fn parse_links(content: &str) -> Vec<MarkdownLink> {
32 let mut links = Vec::new();
33
34 // First, collect reference definitions
35 let mut reference_defs: HashMap<String, String> = HashMap::new();
36 for cap in REFERENCE_DEF_PATTERN.captures_iter(content) {
37 let ref_name = cap[1].to_lowercase();
38 let ref_target = cap[2].trim().to_string();
39 reference_defs.insert(ref_name, ref_target);
40 }
41
42 // Track code block state
43 let mut in_code_block = false;
44
45 // Parse line by line
46 for (line_num, line) in content.lines().enumerate() {
47 let line_number = line_num + 1; // 1-indexed
48
49 // Check for code block markers
50 if line.trim_start().starts_with("```") {
51 in_code_block = !in_code_block;
52 continue;
53 }
54
55 // Skip content inside code blocks
56 if in_code_block {
57 continue;
58 }
59
60 // Parse inline links [text](/python-advanced/06-rust-extensions/case-studies/pyo3-parser/target)
61 for cap in INLINE_LINK_PATTERN.captures_iter(line) {
62 links.push(MarkdownLink {
63 text: cap[1].to_string(),
64 target: cap[2].to_string(),
65 line: line_number,
66 });
67 }
68
69 // Parse reference links [text][ref]
70 for cap in REFERENCE_USE_PATTERN.captures_iter(line) {
71 let ref_name = cap[2].to_lowercase();
72 if let Some(target) = reference_defs.get(&ref_name) {
73 links.push(MarkdownLink {
74 text: cap[1].to_string(),
75 target: target.clone(),
76 line: line_number,
77 });
78 }
79 }
80 }
81
82 links
83}步驟 3:用 PyO3 導出 Python 介面
將 Rust 結構與函式導出給 Python 使用:
1use pyo3::prelude::*;
2use pyo3::types::PyDict;
3
4/// Python-visible link structure
5#[pyclass]
6#[derive(Clone)]
7pub struct PyMarkdownLink {
8 #[pyo3(get)]
9 pub text: String,
10 #[pyo3(get)]
11 pub target: String,
12 #[pyo3(get)]
13 pub line: usize,
14}
15
16#[pymethods]
17impl PyMarkdownLink {
18 fn __repr__(&self) -> String {
19 format!(
20 "MarkdownLink(text='{}', target='{}', line={})",
21 self.text, self.target, self.line
22 )
23 }
24
25 /// Convert to Python dict for compatibility
26 fn to_dict<'py>(&self, py: Python<'py>) -> Bound<'py, PyDict> {
27 let dict = PyDict::new(py);
28 dict.set_item("text", &self.text).unwrap();
29 dict.set_item("target", &self.target).unwrap();
30 dict.set_item("line", self.line).unwrap();
31 dict
32 }
33}
34
35impl From<MarkdownLink> for PyMarkdownLink {
36 fn from(link: MarkdownLink) -> Self {
37 PyMarkdownLink {
38 text: link.text,
39 target: link.target,
40 line: link.line,
41 }
42 }
43}
44
45/// Parse markdown content and return list of links as objects
46#[pyfunction]
47fn parse_markdown_links(content: &str) -> Vec<PyMarkdownLink> {
48 parse_links(content)
49 .into_iter()
50 .map(PyMarkdownLink::from)
51 .collect()
52}
53
54/// Parse markdown content and return list of links as dicts
55/// (for drop-in compatibility with existing Python code)
56#[pyfunction]
57fn parse_markdown_links_as_dicts<'py>(
58 py: Python<'py>,
59 content: &str,
60) -> Vec<Bound<'py, PyDict>> {
61 parse_links(content)
62 .into_iter()
63 .map(|link| {
64 let dict = PyDict::new(py);
65 dict.set_item("text", link.text).unwrap();
66 dict.set_item("target", link.target).unwrap();
67 dict.set_item("line", link.line).unwrap();
68 dict
69 })
70 .collect()
71}
72
73/// Filter out external links, keeping only internal links
74#[pyfunction]
75fn filter_internal_links(links: Vec<PyMarkdownLink>) -> Vec<PyMarkdownLink> {
76 links
77 .into_iter()
78 .filter(|link| {
79 let target = &link.target;
80 // Skip pure anchor links
81 if target.starts_with('#') {
82 return false;
83 }
84 // Skip external links
85 if target.starts_with("http://")
86 || target.starts_with("https://")
87 || target.starts_with("mailto:")
88 || target.starts_with("tel:")
89 || target.starts_with("ftp://")
90 {
91 return false;
92 }
93 true
94 })
95 .collect()
96}
97
98/// Python module definition
99#[pymodule]
100fn markdown_parser_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
101 m.add_class::<PyMarkdownLink>()?;
102 m.add_function(wrap_pyfunction!(parse_markdown_links, m)?)?;
103 m.add_function(wrap_pyfunction!(parse_markdown_links_as_dicts, m)?)?;
104 m.add_function(wrap_pyfunction!(filter_internal_links, m)?)?;
105 Ok(())
106}步驟 4:建置與測試
1# 開發模式建置(快速,用於測試)
2maturin develop
3
4# 或者以 release 模式建置(優化效能)
5maturin develop --release
6
7# 建置 wheel 套件
8maturin build --release
9
10# 安裝到當前環境
11pip install target/wheels/markdown_parser_rs-*.whl完整程式碼
以下是完整的 src/lib.rs:
1//! Markdown Link Parser - A high-performance parser written in Rust
2//!
3//! This module provides fast markdown link parsing capabilities
4//! using Rust's regex crate and PyO3 for Python bindings.
5
6use once_cell::sync::Lazy;
7use pyo3::prelude::*;
8use pyo3::types::PyDict;
9use regex::Regex;
10use std::collections::HashMap;
11
12// ============================================================================
13// Core Data Structures
14// ============================================================================
15
16/// Internal link representation
17#[derive(Debug, Clone)]
18struct MarkdownLink {
19 text: String,
20 target: String,
21 line: usize,
22}
23
24/// Python-visible link structure with getter methods
25#[pyclass]
26#[derive(Clone)]
27pub struct PyMarkdownLink {
28 #[pyo3(get)]
29 pub text: String,
30 #[pyo3(get)]
31 pub target: String,
32 #[pyo3(get)]
33 pub line: usize,
34}
35
36#[pymethods]
37impl PyMarkdownLink {
38 /// String representation for debugging
39 fn __repr__(&self) -> String {
40 format!(
41 "MarkdownLink(text='{}', target='{}', line={})",
42 self.text, self.target, self.line
43 )
44 }
45
46 /// Convert to Python dict for compatibility with existing code
47 fn to_dict<'py>(&self, py: Python<'py>) -> Bound<'py, PyDict> {
48 let dict = PyDict::new(py);
49 dict.set_item("text", &self.text).unwrap();
50 dict.set_item("target", &self.target).unwrap();
51 dict.set_item("line", self.line).unwrap();
52 dict
53 }
54}
55
56impl From<MarkdownLink> for PyMarkdownLink {
57 fn from(link: MarkdownLink) -> Self {
58 PyMarkdownLink {
59 text: link.text,
60 target: link.target,
61 line: link.line,
62 }
63 }
64}
65
66// ============================================================================
67// Pre-compiled Regex Patterns
68// ============================================================================
69
70// Inline link: [text](/python-advanced/06-rust-extensions/case-studies/pyo3-parser/target), excluding images 
71static INLINE_LINK_PATTERN: Lazy<Regex> = Lazy::new(|| {
72 Regex::new(r"(?<!!)\[([^\]]+)\]\(([^)]+)\)").unwrap()
73});
74
75// Reference definition: [ref]: target
76static REFERENCE_DEF_PATTERN: Lazy<Regex> = Lazy::new(|| {
77 Regex::new(r"(?m)^\s*\[([^\]]+)\]:\s*(.+)$").unwrap()
78});
79
80// Reference usage: [text][ref]
81static REFERENCE_USE_PATTERN: Lazy<Regex> = Lazy::new(|| {
82 Regex::new(r"\[([^\]]+)\]\[([^\]]+)\]").unwrap()
83});
84
85// ============================================================================
86// Core Parsing Logic
87// ============================================================================
88
89/// Parse markdown content and extract all links
90///
91/// This function handles:
92/// - Inline links: [text](/python-advanced/06-rust-extensions/case-studies/pyo3-parser/url)
93/// - Reference links: [text][ref] with [ref]: url definitions
94/// - Code block detection (skips links inside ```)
95fn parse_links(content: &str) -> Vec<MarkdownLink> {
96 let mut links = Vec::new();
97
98 // Phase 1: Collect all reference definitions
99 let mut reference_defs: HashMap<String, String> = HashMap::new();
100 for cap in REFERENCE_DEF_PATTERN.captures_iter(content) {
101 let ref_name = cap[1].to_lowercase();
102 let ref_target = cap[2].trim().to_string();
103 reference_defs.insert(ref_name, ref_target);
104 }
105
106 // Phase 2: Parse links line by line
107 let mut in_code_block = false;
108
109 for (line_num, line) in content.lines().enumerate() {
110 let line_number = line_num + 1; // Convert to 1-indexed
111
112 // Toggle code block state
113 if line.trim_start().starts_with("```") {
114 in_code_block = !in_code_block;
115 continue;
116 }
117
118 // Skip content inside code blocks
119 if in_code_block {
120 continue;
121 }
122
123 // Extract inline links
124 for cap in INLINE_LINK_PATTERN.captures_iter(line) {
125 links.push(MarkdownLink {
126 text: cap[1].to_string(),
127 target: cap[2].to_string(),
128 line: line_number,
129 });
130 }
131
132 // Extract reference-style links
133 for cap in REFERENCE_USE_PATTERN.captures_iter(line) {
134 let ref_name = cap[2].to_lowercase();
135 if let Some(target) = reference_defs.get(&ref_name) {
136 links.push(MarkdownLink {
137 text: cap[1].to_string(),
138 target: target.clone(),
139 line: line_number,
140 });
141 }
142 }
143 }
144
145 links
146}
147
148/// Check if a link target is external
149fn is_external_link(target: &str) -> bool {
150 target.starts_with("http://")
151 || target.starts_with("https://")
152 || target.starts_with("mailto:")
153 || target.starts_with("tel:")
154 || target.starts_with("ftp://")
155}
156
157// ============================================================================
158// Python Interface Functions
159// ============================================================================
160
161/// Parse markdown content and return a list of MarkdownLink objects
162///
163/// Args:
164/// content: The markdown content to parse
165///
166/// Returns:
167/// List of MarkdownLink objects
168///
169/// Example:
170/// >>> links = parse_markdown_links("Check [docs](./README.md)")
171/// >>> links[0].text
172/// 'docs'
173/// >>> links[0].target
174/// './README.md'
175#[pyfunction]
176fn parse_markdown_links(content: &str) -> Vec<PyMarkdownLink> {
177 parse_links(content)
178 .into_iter()
179 .map(PyMarkdownLink::from)
180 .collect()
181}
182
183/// Parse markdown content and return a list of dicts
184///
185/// This function provides drop-in compatibility with the original
186/// Python implementation that returns dicts.
187///
188/// Args:
189/// content: The markdown content to parse
190///
191/// Returns:
192/// List of dicts with keys: text, target, line
193#[pyfunction]
194fn parse_markdown_links_as_dicts<'py>(
195 py: Python<'py>,
196 content: &str,
197) -> Vec<Bound<'py, PyDict>> {
198 parse_links(content)
199 .into_iter()
200 .map(|link| {
201 let dict = PyDict::new(py);
202 dict.set_item("text", link.text).unwrap();
203 dict.set_item("target", link.target).unwrap();
204 dict.set_item("line", link.line).unwrap();
205 dict
206 })
207 .collect()
208}
209
210/// Filter links to keep only internal ones
211///
212/// Removes:
213/// - External links (http://, https://, mailto:, etc.)
214/// - Pure anchor links (#section)
215///
216/// Args:
217/// links: List of MarkdownLink objects
218///
219/// Returns:
220/// Filtered list of internal links
221#[pyfunction]
222fn filter_internal_links(links: Vec<PyMarkdownLink>) -> Vec<PyMarkdownLink> {
223 links
224 .into_iter()
225 .filter(|link| {
226 let target = &link.target;
227 // Skip pure anchor links
228 if target.starts_with('#') {
229 return false;
230 }
231 // Skip external links
232 !is_external_link(target)
233 })
234 .collect()
235}
236
237/// Count total links in content (fast path, no object creation)
238///
239/// Args:
240/// content: The markdown content to parse
241///
242/// Returns:
243/// Number of links found
244#[pyfunction]
245fn count_links(content: &str) -> usize {
246 parse_links(content).len()
247}
248
249/// Parse and filter in one pass (most efficient for link checking)
250///
251/// Args:
252/// content: The markdown content to parse
253///
254/// Returns:
255/// List of internal MarkdownLink objects
256#[pyfunction]
257fn parse_internal_links(content: &str) -> Vec<PyMarkdownLink> {
258 parse_links(content)
259 .into_iter()
260 .filter(|link| {
261 !link.target.starts_with('#') && !is_external_link(&link.target)
262 })
263 .map(PyMarkdownLink::from)
264 .collect()
265}
266
267// ============================================================================
268// Module Definition
269// ============================================================================
270
271/// High-performance Markdown link parser
272///
273/// This module provides Rust-powered functions for parsing
274/// and filtering markdown links.
275///
276/// Functions:
277/// parse_markdown_links: Parse content, return MarkdownLink objects
278/// parse_markdown_links_as_dicts: Parse content, return dicts
279/// parse_internal_links: Parse and filter to internal links only
280/// filter_internal_links: Filter existing links
281/// count_links: Fast link counting
282///
283/// Classes:
284/// MarkdownLink: Represents a parsed link
285#[pymodule]
286fn markdown_parser_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
287 m.add_class::<PyMarkdownLink>()?;
288 m.add_function(wrap_pyfunction!(parse_markdown_links, m)?)?;
289 m.add_function(wrap_pyfunction!(parse_markdown_links_as_dicts, m)?)?;
290 m.add_function(wrap_pyfunction!(filter_internal_links, m)?)?;
291 m.add_function(wrap_pyfunction!(count_links, m)?)?;
292 m.add_function(wrap_pyfunction!(parse_internal_links, m)?)?;
293 Ok(())
294}Python 整合範例
以下展示如何在現有程式碼中整合 Rust 模組:
1"""
2使用 Rust 加速的 Markdown 連結檢查器
3
4這個範例展示如何用 Rust 模組替換原有的 Python 解析邏輯,
5同時保持 API 相容性。
6"""
7
8from pathlib import Path
9from typing import List, Dict, Optional
10
11# Try to import Rust module, fallback to pure Python
12try:
13 import markdown_parser_rs as parser_rs
14 USE_RUST = True
15 print("Using Rust-powered parser")
16except ImportError:
17 USE_RUST = False
18 print("Rust module not available, using pure Python")
19
20class MarkdownLinkChecker:
21 """Markdown link checker with optional Rust acceleration"""
22
23 def __init__(self, use_rust: bool = True):
24 """
25 Initialize the checker
26
27 Args:
28 use_rust: Whether to use Rust module if available
29 """
30 self.use_rust = use_rust and USE_RUST
31
32 def parse_markdown_links(self, content: str) -> List[Dict]:
33 """
34 Parse markdown content and extract all links
35
36 Args:
37 content: Markdown content
38
39 Returns:
40 List of dicts with keys: text, target, line
41 """
42 if self.use_rust:
43 # Use Rust implementation
44 return parser_rs.parse_markdown_links_as_dicts(content)
45 else:
46 # Fallback to pure Python (original implementation)
47 return self._parse_python(content)
48
49 def parse_internal_links(self, content: str) -> List[Dict]:
50 """
51 Parse and filter to internal links only
52
53 Args:
54 content: Markdown content
55
56 Returns:
57 List of internal link dicts
58 """
59 if self.use_rust:
60 # Use optimized Rust function that parses and filters in one pass
61 links = parser_rs.parse_internal_links(content)
62 return [link.to_dict() for link in links]
63 else:
64 all_links = self._parse_python(content)
65 return self._filter_internal(all_links)
66
67 def _parse_python(self, content: str) -> List[Dict]:
68 """Pure Python implementation (fallback)"""
69 import re
70
71 INLINE_LINK = re.compile(r'(?<!!)\[([^\]]+)\]\(([^)]+)\)')
72 REFERENCE_DEF = re.compile(r'(?m)^\s*\[([^\]]+)\]:\s*(.+)$')
73 REFERENCE_USE = re.compile(r'\[([^\]]+)\]\[([^\]]+)\]')
74
75 links = []
76
77 # Collect reference definitions
78 reference_defs = {}
79 for match in REFERENCE_DEF.finditer(content):
80 ref_name = match.group(1).lower()
81 ref_target = match.group(2).strip()
82 reference_defs[ref_name] = ref_target
83
84 # Parse line by line
85 in_code_block = False
86 for line_num, line in enumerate(content.split('\n'), start=1):
87 if line.strip().startswith("```"):
88 in_code_block = not in_code_block
89 continue
90
91 if in_code_block:
92 continue
93
94 for match in INLINE_LINK.finditer(line):
95 links.append({
96 "text": match.group(1),
97 "target": match.group(2),
98 "line": line_num
99 })
100
101 for match in REFERENCE_USE.finditer(line):
102 ref_name = match.group(2).lower()
103 if ref_name in reference_defs:
104 links.append({
105 "text": match.group(1),
106 "target": reference_defs[ref_name],
107 "line": line_num
108 })
109
110 return links
111
112 def _filter_internal(self, links: List[Dict]) -> List[Dict]:
113 """Filter to internal links only"""
114 external_prefixes = (
115 'http://', 'https://', 'mailto:', 'tel:', 'ftp://'
116 )
117 return [
118 link for link in links
119 if not link['target'].startswith('#')
120 and not link['target'].startswith(external_prefixes)
121 ]
122
123# Convenience functions
124def check_file(file_path: str, use_rust: bool = True) -> Dict:
125 """
126 Check a single markdown file for broken links
127
128 Args:
129 file_path: Path to the markdown file
130 use_rust: Whether to use Rust acceleration
131
132 Returns:
133 Dict with file_path, total_links, and internal_links count
134 """
135 checker = MarkdownLinkChecker(use_rust=use_rust)
136 path = Path(file_path)
137 content = path.read_text(encoding='utf-8')
138
139 all_links = checker.parse_markdown_links(content)
140 internal_links = checker.parse_internal_links(content)
141
142 return {
143 "file_path": str(path),
144 "total_links": len(all_links),
145 "internal_links": len(internal_links),
146 "links": internal_links
147 }
148
if __name__ == "__main__":
    # Example usage.
    # The parser only treats lines starting with three backticks as code
    # fences, so the sample must use that fence for the "ignored" link to be
    # ignored. The fence is built at runtime so this script can itself be
    # embedded in a fenced Markdown block.
    fence = "`" * 3
    sample = (
        "# Sample Document\n\n"
        "Check the [documentation](./docs/README.md) for more info.\n\n"
        "External link: [Google](https://google.com)\n\n"
        "Reference style: [API docs][api]\n\n"
        "[api]: ./api/reference.md\n\n"
        f"{fence}python\n"
        "# This [link](./should_be_ignored.md) is in a code block\n"
        f"{fence}\n"
    )

    checker = MarkdownLinkChecker()
    links = checker.parse_markdown_links(sample)

    print("All links found:")
    for link in links:
        print(f" Line {link['line']}: [{link['text']}]({link['target']})")

    print("\nInternal links only:")
    internal = checker.parse_internal_links(sample)
    for link in internal:
        print(f" Line {link['line']}: [{link['text']}]({link['target']})")
效能比較
以下是完整的效能測試腳本:
1"""
2Performance comparison: Python vs Cython vs Rust
3
4This script benchmarks the three implementations on
5various markdown file sizes.
6"""
7
8import time
9import statistics
10from pathlib import Path
11from typing import Callable, List, Tuple
12
13# Generate test data
def generate_markdown(num_links: int) -> str:
    """Generate markdown content for benchmarking.

    One entry is emitted per index ``i`` in ``range(num_links)``, cycling
    through: inline link, external link, reference-style link (plus its
    definition line), anchor link, plain paragraph. Every 20th index also
    appends a fenced code block containing a link that parsers must ignore.

    Args:
        num_links: Number of entries to generate.

    Returns:
        The generated markdown document as a single string.
    """
    # The benchmarked parsers only recognise lines starting with three
    # backticks as code fences, so the fake links must sit inside that kind
    # of fence to be skipped. Built at runtime so this script can itself be
    # embedded in a fenced Markdown block.
    fence = "`" * 3
    lines = ["# Test Document\n"]

    for i in range(num_links):
        kind = i % 5
        if kind == 0:
            # Inline link
            lines.append(f"Check [link{i}](/python-advanced/06-rust-extensions/case-studies/pyo3-parser/path/to/file{i}.md) for info.\n")
        elif kind == 1:
            # External link (should be filtered)
            lines.append(f"Visit [site{i}](https://example{i}.com)\n")
        elif kind == 2:
            # Reference style link followed by its definition
            lines.append(f"See [doc{i}][ref{i}]\n")
            lines.append(f"[ref{i}]: ./docs/page{i}.md\n")
        elif kind == 3:
            # Anchor link (should be filtered)
            lines.append(f"Jump to [section{i}](#section-{i})\n")
        else:
            # Regular text
            lines.append(f"This is paragraph {i} with some text.\n")

        # Add occasional code blocks whose links must be ignored
        if i % 20 == 0:
            lines.append(f"{fence}python\n")
            lines.append(f"# [fake link](/python-advanced/06-rust-extensions/case-studies/pyo3-parser/should_ignore_{i}.md)\n")
            lines.append("print('hello')\n")
            lines.append(f"{fence}\n")

    return "".join(lines)
44
45def benchmark(
46 func: Callable[[str], List],
47 content: str,
48 iterations: int = 100
49) -> Tuple[float, float, float]:
50 """
51 Benchmark a function
52
53 Returns:
54 Tuple of (mean_time_ms, min_time_ms, max_time_ms)
55 """
56 times = []
57
58 # Warmup
59 for _ in range(5):
60 func(content)
61
62 # Actual benchmark
63 for _ in range(iterations):
64 start = time.perf_counter()
65 func(content)
66 end = time.perf_counter()
67 times.append((end - start) * 1000) # Convert to ms
68
69 return (
70 statistics.mean(times),
71 min(times),
72 max(times)
73 )
74
75def run_benchmarks():
76 """Run benchmarks comparing all implementations"""
77
78 # Import implementations
79 import re
80
81 # Pure Python implementation
82 INLINE_LINK = re.compile(r'(?<!!)\[([^\]]+)\]\(([^)]+)\)')
83
84 def parse_python(content: str) -> List:
85 links = []
86 in_code_block = False
87 for line_num, line in enumerate(content.split('\n'), 1):
88 if line.strip().startswith("```"):
89 in_code_block = not in_code_block
90 continue
91 if in_code_block:
92 continue
93 for m in INLINE_LINK.finditer(line):
94 links.append({"text": m.group(1), "target": m.group(2), "line": line_num})
95 return links
96
97 # Try to import Rust implementation
98 try:
99 import markdown_parser_rs as rust_parser
100 has_rust = True
101 except ImportError:
102 has_rust = False
103 print("Rust module not available")
104
105 # Test sizes
106 sizes = [100, 500, 1000, 5000, 10000]
107
108 print("=" * 70)
109 print("Markdown Link Parser Benchmark")
110 print("=" * 70)
111 print()
112
113 results = []
114
115 for size in sizes:
116 content = generate_markdown(size)
117 content_kb = len(content.encode('utf-8')) / 1024
118
119 print(f"Test: {size} links (~{content_kb:.1f} KB)")
120 print("-" * 50)
121
122 # Python benchmark
123 py_mean, py_min, py_max = benchmark(parse_python, content)
124 print(f" Python: {py_mean:8.3f} ms (min: {py_min:.3f}, max: {py_max:.3f})")
125
126 # Rust benchmark
127 if has_rust:
128 rs_mean, rs_min, rs_max = benchmark(
129 rust_parser.parse_markdown_links,
130 content
131 )
132 speedup = py_mean / rs_mean
133 print(f" Rust: {rs_mean:8.3f} ms (min: {rs_min:.3f}, max: {rs_max:.3f})")
134 print(f" Speedup: {speedup:.1f}x faster")
135
136 print()
137 results.append({
138 "size": size,
139 "python_ms": py_mean,
140 "rust_ms": rs_mean if has_rust else None,
141 "speedup": speedup if has_rust else None
142 })
143
144 # Summary table
145 print("=" * 70)
146 print("Summary")
147 print("=" * 70)
148 print(f"{'Links':<10} {'Python (ms)':<15} {'Rust (ms)':<15} {'Speedup':<10}")
149 print("-" * 50)
150 for r in results:
151 rust_str = f"{r['rust_ms']:.3f}" if r['rust_ms'] else "N/A"
152 speedup_str = f"{r['speedup']:.1f}x" if r['speedup'] else "N/A"
153 print(f"{r['size']:<10} {r['python_ms']:<15.3f} {rust_str:<15} {speedup_str:<10}")
154
155if __name__ == "__main__":
156 run_benchmarks()典型效能結果:
| 連結數 | Python (ms) | Rust (ms) | 加速比 |
|---|---|---|---|
| 100 | 0.45 | 0.03 | 15x |
| 500 | 2.10 | 0.12 | 18x |
| 1000 | 4.25 | 0.22 | 19x |
| 5000 | 21.50 | 1.05 | 20x |
| 10000 | 43.80 | 2.10 | 21x |
注意:實際效能取決於硬體和內容複雜度。Rust 的優勢在大型檔案上更加明顯。
設計權衡
| 面向 | Python | Cython | Rust (PyO3) |
|---|---|---|---|
| 開發速度 | 快(數小時) | 中(數天) | 慢(數天至週) |
| 執行速度 | 1x | 2-10x | 10-100x |
| 記憶體安全 | GC 管理 | GC 管理 | 編譯時保證 |
| 學習曲線 | 低 | 中 | 高 |
| 除錯難度 | 低 | 中 | 高 |
| 部署複雜度 | 低 | 中 | 中 |
| 跨平台支援 | 優秀 | 需編譯 | 需編譯 |
| 生態系統 | 豐富 | 有限 | 豐富(Cargo) |
選擇決策樹
1需要加速 Python 程式碼?
2├── 否 → 保持純 Python
3└── 是 → 效能需求多高?
4 ├── 2-5x 足夠 → 考慮 Cython
5 └── 需要 10x+ → 團隊有 Rust 經驗?
6 ├── 是 → 使用 PyO3
7 └── 否 → 效能瓶頸明確嗎?
8 ├── 是 → 值得學習 Rust
9 └── 否 → 先用 Cython,後續再評估
什麼時候該用 Rust?
適合使用:
- 需要極致效能(10x+ 加速)
- CPU 密集的核心邏輯
- 需要處理大量資料
- 團隊有 Rust 經驗
- 需要記憶體安全保證
- 可利用 Rust 生態系統(如 regex, rayon)
不建議使用:
- 效能需求不高
- 快速原型開發
- 團隊不熟悉 Rust
- 專案生命週期短
- I/O 密集型任務(瓶頸不在 CPU)
練習
練習 1:基礎練習 - 字串處理函式
用 PyO3 實作一個字串處理函式,將 Markdown 標題轉換為 slug:
1// 目標:將 "Hello World! 你好" 轉換為 "hello-world-你好"
2#[pyfunction]
3fn slugify(title: &str) -> String {
4 // 你的實作
5 todo!()
6}提示:
- 轉換為小寫
- 移除特殊字元
- 用連字號替換空白
參考解答:
1#[pyfunction]
2fn slugify(title: &str) -> String {
3 title
4 .chars()
5 .map(|c| {
6 if c.is_alphanumeric() {
7 c.to_lowercase().next().unwrap_or(c)
8 } else if c.is_whitespace() {
9 '-'
10 } else {
11 // Keep non-ASCII chars (like CJK)
12 if c.is_ascii() { '\0' } else { c }
13 }
14 })
15 .filter(|&c| c != '\0')
16 .collect::<String>()
17 // Clean up multiple consecutive dashes
18 .split('-')
19 .filter(|s| !s.is_empty())
20 .collect::<Vec<_>>()
21 .join("-")
22}練習 2:進階練習 - 模式匹配
用 regex crate 實作一個函式,提取 Markdown 文件中的所有標題:
1// 目標:提取 # 標題,## 標題,### 標題 等
2#[pyclass]
3struct Heading {
4 #[pyo3(get)]
5 level: usize,
6 #[pyo3(get)]
7 text: String,
8 #[pyo3(get)]
9 line: usize,
10}
11
12#[pyfunction]
13fn extract_headings(content: &str) -> Vec<Heading> {
14 // 你的實作
15 todo!()
16}提示:
- 使用 `^#{1,6}\s+(.+)$` 正則表達式
- 記得處理 multiline 模式
參考解答:
1use once_cell::sync::Lazy;
2use regex::Regex;
3
4static HEADING_PATTERN: Lazy<Regex> = Lazy::new(|| {
5 Regex::new(r"(?m)^(#{1,6})\s+(.+)$").unwrap()
6});
7
8#[pyfunction]
9fn extract_headings(content: &str) -> Vec<Heading> {
10 let mut headings = Vec::new();
11 let mut current_line = 1;
12 let mut last_end = 0;
13
14 for cap in HEADING_PATTERN.captures_iter(content) {
15 let match_start = cap.get(0).unwrap().start();
16
17 // Count newlines to determine line number
18 current_line += content[last_end..match_start]
19 .chars()
20 .filter(|&c| c == '\n')
21 .count();
22 last_end = match_start;
23
24 let level = cap[1].len();
25 let text = cap[2].trim().to_string();
26
27 headings.push(Heading {
28 level,
29 text,
30 line: current_line,
31 });
32 }
33
34 headings
35}練習 3:挑戰題 - 串流解析
實作一個可處理大型檔案的串流解析器:
1use std::io::{BufRead, BufReader};
2use std::fs::File;
3
4#[pyclass]
5struct StreamingParser {
6 // 你的實作
7}
8
9#[pymethods]
10impl StreamingParser {
11 #[new]
12 fn new(file_path: &str) -> PyResult<Self> {
13 // 開啟檔案
14 todo!()
15 }
16
17 /// 迭代器協議
18 fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
19 slf
20 }
21
22 fn __next__(&mut self) -> Option<PyMarkdownLink> {
23 // 讀取下一個連結
24 todo!()
25 }
26}提示:
- 使用 `BufReader` 逐行讀取
- 維護狀態(行號、程式碼區塊)
- 實作 Python 迭代器協議
參考解答思路:
1use pyo3::prelude::*;
2use std::fs::File;
3use std::io::{BufRead, BufReader};
4
5#[pyclass]
6struct StreamingParser {
7 reader: BufReader<File>,
8 line_number: usize,
9 in_code_block: bool,
10 // Buffer for pending links found on current line
11 pending_links: Vec<PyMarkdownLink>,
12}
13
14#[pymethods]
15impl StreamingParser {
16 #[new]
17 fn new(file_path: &str) -> PyResult<Self> {
18 let file = File::open(file_path)
19 .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(
20 format!("Cannot open file: {}", e)
21 ))?;
22
23 Ok(StreamingParser {
24 reader: BufReader::new(file),
25 line_number: 0,
26 in_code_block: false,
27 pending_links: Vec::new(),
28 })
29 }
30
31 fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
32 slf
33 }
34
35 fn __next__(mut slf: PyRefMut<Self>) -> Option<PyMarkdownLink> {
36 // Return pending links first
37 if let Some(link) = slf.pending_links.pop() {
38 return Some(link);
39 }
40
41 // Read and parse lines until we find links
42 let mut line = String::new();
43 loop {
44 line.clear();
45 match slf.reader.read_line(&mut line) {
46 Ok(0) => return None, // EOF
47 Ok(_) => {
48 slf.line_number += 1;
49
50 // Handle code blocks
51 if line.trim_start().starts_with("```") {
52 slf.in_code_block = !slf.in_code_block;
53 continue;
54 }
55
56 if slf.in_code_block {
57 continue;
58 }
59
60 // Parse links from this line
61 let links = parse_line_links(&line, slf.line_number);
62 if !links.is_empty() {
63 slf.pending_links = links;
64 return slf.pending_links.pop();
65 }
66 }
67 Err(_) => return None,
68 }
69 }
70 }
71}
72
73fn parse_line_links(line: &str, line_number: usize) -> Vec<PyMarkdownLink> {
74 let mut links = Vec::new();
75 for cap in INLINE_LINK_PATTERN.captures_iter(line) {
76 links.push(PyMarkdownLink {
77 text: cap[1].to_string(),
78 target: cap[2].to_string(),
79 line: line_number,
80 });
81 }
82 links
83}延伸閱讀
- PyO3 官方文件:完整的 PyO3 指南
- Maturin 官方文件:Rust Python 套件建置工具
- Rust regex crate:高效能正則表達式
- PyO3 使用者指南:進階用法
- Rust 程式設計語言:官方 Rust 教學
下一章:Rust 正則表達式