Day39 - 正则表达式基础

详细讲解

1. 正则表达式简介

正则表达式是一种用于匹配字符串的模式表达式，在文本处理、数据提取、验证等场景中广泛应用。

import re

# 最简单的匹配
pattern = r'hello'
text = 'hello world'

if re.search(pattern, text):
    print("找到匹配！")

# 常用场景
# 1. 验证邮箱格式
# 2. 提取网页中的链接
# 3. 处理日志文件
# 4. 数据清洗和转换

2. re 模块主要函数

2.1 `re.match()` - 从开头匹配

import re

# match() 从字符串开头匹配
result = re.match(r'hello', 'hello world')
print(result)  # <re.Match object>
print(result.group())  # 'hello'
print(result.span())   # (0, 5)

# 如果开头不匹配，返回 None
result = re.match(r'world', 'hello world')
print(result)  # None

# 使用分组
result = re.match(r'(\w+), (\w+)', 'hello, world')
if result:
    print(result.group(1))  # 'hello'
    print(result.group(2))  # 'world'
    print(result.groups())  # ('hello', 'world')

2.2 `re.search()` - 搜索匹配

# search() 在字符串任意位置查找第一个匹配
result = re.search(r'world', 'hello world')
print(result.group())  # 'world'
print(result.span())   # (6, 11)

# 搜索邮箱
text = "联系邮箱: test@example.com 或 admin@test.org"
result = re.search(r'\w+@\w+\.\w+', text)
print(result.group())  # 'test@example.com'

# 找不到返回 None
result = re.search(r'notfound', 'hello')
print(result)  # None

2.3 `re.findall()` - 查找所有匹配

# findall() 返回所有匹配的列表
text = "我的邮箱是 test@example.com，副邮箱是 backup@example.org"

# 查找所有邮箱
emails = re.findall(r'\w+@\w+\.\w+', text)
print(emails)  # ['test@example.com', 'backup@example.org']

# 查找所有数字
numbers = re.findall(r'\d+', '我有 3 个苹果，5 个橙子，共 8 个水果')
print(numbers)  # ['3', '5', '8']

# 使用分组 - 返回分组内容的列表
text = "红色: #FF0000, 绿色: #00FF00, 蓝色: #0000FF"
colors = re.findall(r'#([0-9A-Fa-f]{6})', text)
print(colors)  # ['FF0000', '00FF00', '0000FF']

2.4 `re.finditer()` - 迭代器版本

# finditer() 返回迭代器，比 findall() 内存效率高
text = "邮箱: a@test.com, b@test.com, c@test.com"

for match in re.finditer(r'\w+@\w+\.\w+', text):
    print(f"位置 {match.span()}: {match.group()}")

3. 字符类（Character Classes）

# [] - 匹配方括号内的任意一个字符

# 匹配元音字母
vowels = re.findall(r'[aeiouAEIOU]', 'Hello World')
print(vowels)  # ['e', 'o', 'o']

# 匹配数字或字母
alphanum = re.findall(r'[a-zA-Z0-9]', 'Hello123!')
print(alphanum)  # ['H', 'e', 'l', 'l', 'o', '1', '2', '3']

# [^] - 否定，匹配不在方括号内的字符
non_vowels = re.findall(r'[^aeiouAEIOU]', 'Hello')
print(non_vowels)  # ['H', 'l', 'l']

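# 常用简写
# 注意：下面的等价写法只在 re.ASCII 标志或 bytes 模式下严格成立；
# Python 3 的 str 模式默认按 Unicode 匹配，例如 \w 也能匹配中文，\d 也能匹配全角数字。
# \d  = [0-9]        数字
# \D  = [^0-9]       非数字
# \w  = [a-zA-Z0-9_] 单词字符
# \W  = [^a-zA-Z0-9_] 非单词字符
# \s  = [ \t\n\r\f\v] 空白字符
# \S  = [^ \t\n\r\f\v] 非空白字符
# .   = 任意字符（除换行符）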

# 示例
text = "用户: ZhangSan, 年龄: 25, 电话: 138-0000-8888"

ages = re.findall(r'\d+', text)
print(ages)  # ['25', '138', '0000', '8888']

words = re.findall(r'\w+', text)
print(words)  # ['用户', 'ZhangSan', '年龄', '25', '电话', '138', '0000', '8888']

4. 量词（Quantifiers）

# *    - 0 次或多次
# +    - 1 次或多次
# ?    - 0 次或 1 次
# {n}  - 恰好 n 次
# {n,} - 至少 n 次
# {n,m} - n 到 m 次

# * 示例：匹配任意数量字符
pattern = r'ab*c'  # ac, abc, abbc, abbbc, ...
print(re.findall(r'ab*c', 'ac abc abbc abbbc'))  # ['ac', 'abc', 'abbc', 'abbbc']

# + 示例：至少一个
pattern = r'ab+c'  # abc, abbc, abbbc, ... (不包括 ac)
print(re.findall(r'ab+c', 'ac abc abbc'))  # ['abc', 'abbc']

# ? 示例：可选
pattern = r'colou?r'  # color, colour
print(re.findall(r'colou?r', 'color colour'))  # ['color', 'colour']

# {n} 示例：精确次数
pattern = r'\d{4}'  # 4 位数字
print(re.findall(r'\d{4}', '2024年 12345月'))  # ['2024', '1234']

# {n,m} 示例：范围次数
pattern = r'\d{2,4}'  # 2-4 位数字
print(re.findall(r'\d{2,4}', '1 12 123 1234 12345'))
# ['12', '123', '1234', '1234'] - 注意贪婪匹配：12345 先匹配 1234，剩下的单个 5 不足 2 位，不会被匹配

# Greedy vs. non-greedy (minimal) matching
text = '<div>content</div>'

# With a single pair of tags both forms return the same match
print(re.findall(r'<div>.*</div>', text))  # ['<div>content</div>']
print(re.findall(r'<div>.*?</div>', text))  # ['<div>content</div>']

# The difference only shows once the closing tag occurs more than once
text = '<div>div1</div><div>div2</div>'

# Greedy: .* matches as much as possible, running to the LAST </div>
print(re.findall(r'<div>.*</div>', text))  # ['<div>div1</div><div>div2</div>']

# Non-greedy: appending ? makes .*? match as little as possible,
# stopping at the FIRST </div>
print(re.findall(r'<div>.*?</div>', text))  # ['<div>div1</div>', '<div>div2</div>']

5. 边界匹配

# ^  - 字符串开头
# $  - 字符串结尾
# \b - 单词边界
# \B - 非单词边界

# 匹配以数字开头的行
text = "123abc\nhello\n456xyz"
lines = re.findall(r'^\d+', text, re.MULTILINE)
print(lines)  # ['123', '456']

# Match the domains ending in ".com", one per line.
# re.MULTILINE is required here: without it `$` only anchors at the very end
# of the string, so only 'mail.com' would be returned.
domains = re.findall(r'\w+\.com$', 'test.com\nexample.org\nmail.com', re.MULTILINE)
print(domains)  # ['test.com', 'mail.com']

# \b word-boundary example.
# The sample text holds 'cat' twice as a standalone word and once embedded in
# the longer word 'scatter', so the two patterns below give different results.
text = "a cat scatter the in cat"

# Match only the complete word 'cat'
cats = re.findall(r'\bcat\b', text)
print(cats)  # ['cat', 'cat'] - the two standalone words

# Without boundaries
cats_all = re.findall(r'cat', text)
print(cats_all)  # ['cat', 'cat', 'cat'] - also the 'cat' inside s[cat]ter

# \B non-boundary example
text = "abc abc"
print(re.findall(r'ab\B', text))  # ['ab', 'ab'] - in both words 'ab' is followed by 'c', so \B holds after it
print(re.findall(r'\Bab\B', text))  # [] - every 'ab' begins a word, so the leading \B never matches

6. 分组（Groups）

# () - 创建分组

# 提取邮箱用户名和域名
text = "邮箱: john@example.com 和 jane@test.org"
matches = re.findall(r'(\w+)@(\w+)\.(\w+)', text)
print(matches)  # [('john', 'example', 'com'), ('jane', 'test', 'org')]

# 多个分组（按左括号出现的顺序从 1 开始编号）
text = "2024-05-01"
match = re.search(r'(\d{4})-(\d{2})-(\d{2})', text)
if match:
    print(match.groups())  # ('2024', '05', '01')
    print(match.group(1))  # '2024' 年
    print(match.group(2))  # '05' 月
    print(match.group(3))  # '01' 日

# 命名分组
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.search(pattern, '2024-05-01')
if match:
    print(match.group('year'))   # '2024'
    print(match.group('month'))  # '05'
    print(match.group('day'))     # '01'
    print(match.groupdict())     # {'year': '2024', 'month': '05', 'day': '01'}

# 非捕获分组 (?:)
text = "abc123def"
# 非捕获分组 (?:abc) 只参与匹配，不会出现在 groups() 中；只有捕获分组 (\d+) 会被返回
match = re.search(r'(?:abc)(\d+)', text)
print(match.groups())  # ('123',)

# 分组在替换中的应用
text = "2024-05-01"
result = re.sub(r'(\d{4})-(\d{2})-(\d{2})', r'\2/\3/\1', text)
print(result)  # 05/01/2024

7. re.compile() 编译正则

# 编译正则表达式，提高匹配效率
pattern = re.compile(r'\d{4}-\d{2}-\d{2}')

# 使用编译后的 pattern
text = "今天是 2024-05-01，明天是 2024-05-02"

print(pattern.findall(text))  # ['2024-05-01', '2024-05-02']
print(pattern.search(text).group())  # '2024-05-01'

# 编译时指定 flags
pattern = re.compile(r'hello', re.IGNORECASE)
print(pattern.findall('Hello HELLO hello'))  # ['Hello', 'HELLO', 'hello']

8. re 模块的 flags

# re.IGNORECASE / re.I - 忽略大小写
print(re.findall(r'hello', 'Hello HELLO', re.I))

# re.MULTILINE / re.M - 多行模式
text = "line1\nline2\nline3"
print(re.findall(r'^line\d', text, re.M))  # ['line1', 'line2', 'line3']

# re.DOTALL / re.S - 点号匹配所有字符（包括换行）
text = "hello\nworld"
print(re.findall(r'hello.*world', text))    # [] - 默认不匹配换行
print(re.findall(r'hello.*world', text, re.S))  # ['hello\nworld']

# re.VERBOSE / re.X - 详细模式（忽略空白和注释）
pattern = re.compile(r'''
    \d{3}  # 区号
    -      # 分隔符
    \d{4}  # 号码
''', re.VERBOSE)

# 可以组合使用
pattern = re.compile(r'hello', re.I | re.M)

背诵版

核心速查

re 模块主要函数

函数	说明
`re.match()`	从字符串开头匹配
`re.search()`	搜索第一个匹配
`re.findall()`	返回所有匹配的列表
`re.finditer()`	返回匹配的迭代器
`re.sub()`	替换匹配项
`re.split()`	按匹配分割
`re.compile()`	编译正则表达式

字符类速查

模式	说明	等价
`\d`	数字	`[0-9]`
`\D`	非数字	`[^0-9]`
`\w`	单词字符	`[a-zA-Z0-9_]`（仅在 re.ASCII 下严格等价；默认还匹配中文等 Unicode 字母和数字）
`\W`	非单词字符	`[^a-zA-Z0-9_]`
`\s`	空白字符	`[ \t\n\r\f\v]`
`\S`	非空白字符	`[^ \t\n\r\f\v]`
`.`	任意字符	-

量词速查

量词	说明
`*`	0 次或多次
`+`	1 次或多次
`?`	0 次或 1 次
`{n}`	恰好 n 次
`{n,}`	至少 n 次
`{n,m}`	n 到 m 次

考前记忆

面试重点

match vs search
- match 从字符串开头匹配
- search 在字符串任意位置匹配
findall vs finditer
- findall 返回列表，内存占用高
- finditer 返回迭代器，内存高效
贪婪 vs 非贪婪
- 贪婪：.* 尽可能多匹配
- 非贪婪：.*? 尽可能少匹配
字符类简写
- \d 数字, \w 单词, \s 空白
分组编号
- group(0) 或 group() 是整个匹配
- group(1) 是第一个分组，以此类推

记忆口诀

match 从头搜，search 任意找。findall 找全部，finditer 迭代器。量词控制次数，字符类定范围。贪婪非贪婪，区别在量词后有没有 ?。

测试题

选择题

1. re.match() 和 re.search() 的区别是什么？

# A. 没有区别
# B. match 从开头匹配，search 搜索任意位置
# C. search 从开头匹配，match 搜索任意位置
# D. match 返回列表，search 返回单个

答案：B

2. 以下哪个量词表示"0次或1次"？

# A. *
# B. +
# C. ?
# D. {1}

答案：C

3. 哪个字符类匹配数字？

# A. \w
# B. \s
# C. \d
# D. \D

答案：C

4. 以下代码的输出是什么？

print(re.findall(r'\d+', 'a1b2c3'))

# A. ['1', '2', '3']
# B. '123'
# C. [1, 2, 3]
# D. ['a1', 'b2', 'c3']

答案：A

5. 非贪婪匹配需要添加什么字符？

# A. *
# B. +
# C. ?
# D. #

答案：C

编程题

1. 验证邮箱格式：

import re

def validate_email(email):
    """Return True if *email* looks like a syntactically valid address.

    The check is intentionally simple (ASCII local part, dotted domain,
    alphabetic top-level domain of at least two letters); it is not a full
    RFC 5322 validator.

    Args:
        email: The candidate address as a string.

    Returns:
        bool: True when the entire string matches the pattern.
    """
    # Each domain label must start and end with a letter or digit, which
    # rejects empty or hyphen-edged labels such as 'a@..com' or 'a@-x.com'.
    pattern = (
        r'[a-zA-Z0-9._%+-]+'
        r'@(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+'
        r'[a-zA-Z]{2,}'
    )
    # fullmatch() rather than match() with '$': '$' also matches just before
    # a trailing newline, so 'user@example.com\n' used to be accepted.
    return re.fullmatch(pattern, email) is not None

# 测试
emails = [
    'test@example.com',
    'user.name@domain.org',
    'invalid-email',
    '@nodomain.com',
    'missing@.com'
]

for email in emails:
    result = "✓ 有效" if validate_email(email) else "✗ 无效"
    print(f"{email}: {result}")

2. 提取 URL 信息：

import re

def extract_urls(text):
    """Return every http/https URL found in *text*, in order of appearance.

    A URL is recognised as scheme + host, optionally followed by a port,
    a path, a query string and a fragment. Only ASCII URL characters are
    matched, so surrounding non-ASCII prose is not absorbed.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched URLs (empty list when there are none).
    """
    # All groups are non-capturing so that findall() returns whole matches.
    pattern = (
        r'https?://'
        # Host must end with a letter/digit so a sentence-ending '.' is excluded.
        r'[a-zA-Z0-9](?:[a-zA-Z0-9.-]*[a-zA-Z0-9])?'
        r'(?::\d+)?'                        # optional port
        r'(?:/[a-zA-Z0-9./_%~+-]*)?'        # optional path
        r'(?:\?[a-zA-Z0-9=&%._~+/-]*)?'     # optional query string
        r'(?:#[a-zA-Z0-9=&%._~+/-]*)?'      # optional fragment
    )
    return re.findall(pattern, text)

text = """
访问 https://www.example.com 获取更多信息。
也可以访问 http://blog.example.org 或 https://shop.example.co.uk/products?id=123
"""

urls = extract_urls(text)
for url in urls:
    print(url)

3. 电话号码提取：

import re

def extract_phones(text):
    """Extract mobile and landline phone numbers from *text*.

    Two formats are recognised:
      * mainland-China mobile numbers: 11 digits starting with 13-19;
      * landlines written as ``area code-number`` (3-4 digits, a hyphen,
        then 7-8 digits).

    Args:
        text: The text to scan.

    Returns:
        list[str]: The numbers in the order they appear in *text*.
    """
    # A single alternation keeps the results in document order; running the
    # two patterns separately would group all mobiles before all landlines.
    # The digit lookarounds stop a match from starting or ending inside a
    # longer run of digits (e.g. an order number).
    pattern = (
        r'(?<!\d)1[3-9]\d{9}(?!\d)'         # mobile (11 digits)
        r'|(?<!\d)\d{3,4}-\d{7,8}(?!\d)'    # landline (area code-number)
    )
    return re.findall(pattern, text)

text = """
联系电话：13812345678
座机：010-12345678
手机：13998765432
"""
print(extract_phones(text))  # ['13812345678', '010-12345678', '13998765432']

4. 敏感信息脱敏：

import re

def mask_sensitive(text):
    """Mask mobile numbers and e-mail local parts found in *text*.

    * E-mail: at most the first two characters of the local part are kept
      and the rest is replaced by ``***`` (``john@x.com`` -> ``jo***@x.com``).
    * Mobile number (11 digits starting with 13-19): the middle four digits
      are replaced by ``****`` (``13812345678`` -> ``138****5678``).

    Args:
        text: The text to sanitise.

    Returns:
        str: A copy of *text* with the sensitive parts masked.
    """
    # E-mail first, so a numeric local part such as '13812345678@qq.com' is
    # masked once as an address instead of being half-masked as a phone.
    # The whole local part (including '.', '%', '+', '-') is consumed, and
    # slicing with [:2] also covers local parts of one or two characters.
    text = re.sub(
        r'([\w.%+-]+)@',
        lambda m: m.group(1)[:2] + '***@',
        text,
    )
    # Digit lookarounds keep the pattern from masking part of a longer number.
    text = re.sub(
        r'(?<!\d)(1[3-9]\d{9})(?!\d)',
        lambda m: m.group(1)[:3] + '****' + m.group(1)[7:],
        text,
    )
    return text

text = "手机: 13812345678, 邮箱: john@example.com"
print(mask_sensitive(text))  # 手机: 138****5678, 邮箱: jo***@example.com

问答题

Q1: 什么是正则表达式的贪婪匹配和非贪婪匹配？

贪婪匹配：量词会尽可能多地匹配字符，如 .* 会匹配尽可能多的字符。

非贪婪匹配：在量词后加 ?，如 .*? 会匹配尽可能少的字符。

# Two elements are needed for the two modes to produce different results.
text = '<div>div1</div><div>div2</div>'

# Greedy: .* extends to the last </div>, so everything becomes one match
re.findall(r'<div>.*</div>', text)   # ['<div>div1</div><div>div2</div>']

# Non-greedy: .*? stops at the first </div>, giving one match per element
re.findall(r'<div>.*?</div>', text)  # ['<div>div1</div>', '<div>div2</div>']

Q2: re.findall() 和 re.finditer() 有什么区别？

方法	返回值	内存使用	适用场景
`findall()`	列表	高（一次性返回所有结果）	小数据量
`finditer()`	迭代器	低（逐个返回结果）	大数据量或流式处理

Q3: 字符类 [abc] 和快捷方式 \w 有什么区别？

字符类	说明
`[abc]`	匹配 `a`、`b` 或 `c` 中的任意一个字符
`\w`	匹配字母、数字、下划线 `[a-zA-Z0-9_]`

re.findall(r'[abc]', 'abcd')  # ['a', 'b', 'c']
re.findall(r'\w', 'abcd')     # ['a', 'b', 'c', 'd']

Day39 - 正则表达式基础

Day39 - 正则表达式基础

详细讲解

1. 正则表达式简介

2. re 模块主要函数

2.1 `re.match()` - 从开头匹配

2.2 `re.search()` - 搜索匹配

2.3 `re.findall()` - 查找所有匹配

2.4 `re.finditer()` - 迭代器版本

3. 字符类（Character Classes）

4. 量词（Quantifiers）

5. 边界匹配

6. 分组（Groups）

7. re.compile() 编译正则

8. re 模块的 flags

背诵版

核心速查

字符类速查

量词速查

考前记忆

面试重点

记忆口诀

测试题

选择题

编程题

问答题

参考资料

工具

🔓 解锁完整版

Day39 - 正则表达式基础#

详细讲解#

1. 正则表达式简介#

2. re 模块主要函数#

2.1 re.match() - 从开头匹配#

2.2 re.search() - 搜索匹配#

2.3 re.findall() - 查找所有匹配#

2.4 re.finditer() - 迭代器版本#

3. 字符类（Character Classes）#

4. 量词（Quantifiers）#

5. 边界匹配#

6. 分组（Groups）#

7. re.compile() 编译正则#

8. re 模块的 flags#

背诵版#

核心速查#

字符类速查#

量词速查#

考前记忆#

面试重点#

记忆口诀#

测试题#

选择题#

编程题#

问答题#

参考资料#

工具

🔓 解锁完整版

Day39 - 正则表达式基础

详细讲解

1. 正则表达式简介

2. re 模块主要函数

2.1 `re.match()` - 从开头匹配

2.2 `re.search()` - 搜索匹配

2.3 `re.findall()` - 查找所有匹配

2.4 `re.finditer()` - 迭代器版本

3. 字符类（Character Classes）

4. 量词（Quantifiers）

5. 边界匹配

6. 分组（Groups）

7. re.compile() 编译正则

8. re 模块的 flags

背诵版

核心速查

字符类速查

量词速查

考前记忆

面试重点

记忆口诀

测试题

选择题

编程题

问答题

参考资料