Regular Expressions in Python
Regular expressions (regex) are powerful patterns used to match character combinations in strings.
Python's `re` module provides full support for Perl-like regular expressions,
enabling you to search, extract, and manipulate text with complex pattern matching.
Regex Fundamentals
Basic Patterns
Regular expressions consist of ordinary characters and special metacharacters that define search patterns.
Character | Meaning | Example |
---|---|---|
. | Any character except newline | a.b matches "acb", "adb", etc. |
^ | Start of string | ^hello matches "hello world" |
$ | End of string | world$ matches "hello world" |
* | 0 or more repetitions | ab*c matches "ac", "abc", "abbc" |
+ | 1 or more repetitions | ab+c matches "abc", "abbc" |
? | 0 or 1 repetition | ab?c matches "ac", "abc" |
{m,n} | m to n repetitions | a{2,4} matches "aa", "aaa", "aaaa" |
[] | Character set | [abc] matches "a", "b", or "c" |
[^] | Negated character set | [^abc] matches any char except a, b, c |
\| | Alternation (OR) | a\|b matches "a" or "b" |
() | Grouping | (abc)+ matches "abc", "abcabc" |
import re

# Basic matching: a literal pattern matches itself anywhere in the text.
pattern = r"python"
text = "I love python programming"
match = re.search(pattern, text)
print(match.group())  # Outputs: python

# Using metacharacters: "." is any character and "*" repeats it greedily,
# so ".*" runs on to the LAST "n" in the text, not the first one.
pattern = r"py.*n"
text = "python is fun"
match = re.search(pattern, text)
print(match.group())  # Outputs: python is fun

# Character classes: findall returns every character that is in the set.
pattern = r"[aeiou]"
text = "python"
matches = re.findall(pattern, text)
print(matches)  # Outputs: ['o']

# Anchors: "^" pins the match to the start of the string.
pattern = r"^py"
text = "python starts with py"
match = re.search(pattern, text)
print(match.group())  # Outputs: py
Special Character Sequences
Python's `re` module supports special character sequences for common pattern matching tasks.
Sequence | Meaning |
---|---|
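\d | Any decimal digit (equivalent to [0-9] in ASCII mode; by default also matches other Unicode digits) |
\D | Any non-digit (equivalent to [^0-9] in ASCII mode) |
\w | Any word character (equivalent to [a-zA-Z0-9_] in ASCII mode; by default also matches Unicode letters and digits) |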
\W | Any non-word character |
\s | Any whitespace character (space, tab, newline, etc.) |
\S | Any non-whitespace character |
\b | Word boundary |
\B | Non-word boundary |
import re

# Match digits
pattern = r"\d+"  # One or more digits
text = "I have 3 apples and 2 oranges"
matches = re.findall(pattern, text)
print(matches)  # Outputs: ['3', '2']

# Match word characters
pattern = r"\w{5,}"  # Words with 5 or more characters
text = "Python is an amazing programming language"
matches = re.findall(pattern, text)
print(matches)  # Outputs: ['Python', 'amazing', 'programming', 'language']

# Word boundaries
pattern = r"\bpy\w*\b"  # Words starting with 'py'
text = "python and pygame are both python-based"
# "-" is not a word character, so "python-based" contributes just "python".
matches = re.findall(pattern, text, re.IGNORECASE)
print(matches)  # Outputs: ['python', 'pygame', 'python']
Regex Functions in Python
The `re` module provides several functions to work with regular expressions in Python.
Function | Description |
---|---|
re.search() | Searches for the first match anywhere in the string |
re.match() | Searches for a match only at the beginning of the string |
re.fullmatch() | Searches for a match over the entire string |
re.findall() | Returns a list of all non-overlapping matches |
re.finditer() | Returns an iterator of all non-overlapping matches |
re.sub() | Replaces matches with a string or function result |
re.split() | Splits the string by the occurrences of the pattern |
re.compile() | Compiles a regex pattern for reuse |
import re

text = "Python was created in 1991 by Guido van Rossum"

# re.search() - Find the first match
match = re.search(r"\d+", text)
print(match.group())  # '1991'

# re.match() - Match only at the beginning
match = re.match(r"Python", text)
print(match.group() if match else "No match")  # 'Python'

# re.fullmatch() - Match the entire string
match = re.fullmatch(r"Python.*", text)
print(match.group() if match else "No match")  # Entire string

# re.findall() - Find all matches
# \w also matches digits and "_", so this is "exactly 6 word characters".
matches = re.findall(r"\b\w{6}\b", text)
print(matches)  # ['Python', 'Rossum']

# re.finditer() - Find all matches with details
for match in re.finditer(r"\b\w{6}\b", text):
    print(f"Match '{match.group()}' at position {match.start()}-{match.end()}")

# re.sub() - Replace matches
result = re.sub(r"\d+", "YEAR", text)
print(result)  # "Python was created in YEAR by Guido van Rossum"

# re.split() - Split by pattern
parts = re.split(r"\s+", text)  # Split by whitespace
print(parts)  # ['Python', 'was', 'created', 'in', '1991', 'by', 'Guido', 'van', 'Rossum']

# re.compile() - Compile pattern for reuse
word_pattern = re.compile(r"\b\w+\b")
words = word_pattern.findall(text)
print(len(words))  # 9
Capturing Groups & Common Patterns
Capturing groups let you extract specific parts of matched text and are defined with parentheses.
Working with Groups
import re

# Basic capturing groups
text = "Python 3.9 was released on October 5, 2020"
match = re.search(r"Python (\d+\.\d+).*?(\d{4})", text)
if match:
    version = match.group(1)  # Access first group
    year = match.group(2)  # Access second group
    full = match.group(0)  # The entire match
    all_groups = match.groups()  # All groups as tuple: ('3.9', '2020')
    print(f"Version: {version}, Year: {year}")
    print(f"Full match: {full}")

# Named groups: (?P<name>...) captures like (...) but can be read back by name.
pattern = r"(?P<language>Python) (?P<version>\d+\.\d+).*?(?P<year>\d{4})"
match = re.search(pattern, text)
if match:
    # Access by name
    print(f"Language: {match.group('language')}")
    print(f"Version: {match.group('version')}")
    print(f"Year: {match.group('year')}")
    # Get all as dictionary
    info = match.groupdict()
    print(info)  # {'language': 'Python', 'version': '3.9', 'year': '2020'}

# Non-capturing groups (?:...)
# Useful for grouping without capturing
pattern = r"Python (\d+\.\d+) (?:was released on) (.*)"
match = re.search(pattern, text)
if match:
    print(match.groups())  # ('3.9', 'October 5, 2020')
Common Regex Patterns
Pattern | Matches | Example |
---|---|---|
r"^\w+@\w+\.\w+$" | Basic email | user@example.com |
r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$" | IPv4 address | 192.168.1.1 |
r"^https?://[\w\-\.]+\.\w+(/\S*)?$" | URL | https://example.com/path |
r"^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$" | US Phone number | (123) 456-7890 |
r"^[a-zA-Z0-9]{8,12}$" | Username (8-12 alphanumeric characters) | User1234 |
r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)[a-zA-Z\d]{8,}$" | Strong password | Password123 |
r"^\d{4}-\d{2}-\d{2}$" | Date (YYYY-MM-DD) | 2023-12-31 |
import re


# Validate email
def is_valid_email(email):
    """Return True if *email* looks like ``local@domain.tld``.

    This is a pragmatic shape check, not a full RFC 5322 validator.
    """
    # More comprehensive than the basic pattern: dots and hyphens are allowed
    # on both sides of the "@".
    pattern = r"[\w\.-]+@[\w\.-]+\.\w+"
    # fullmatch() instead of match() with "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" through.
    return bool(re.fullmatch(pattern, email))


print(is_valid_email("user@example.com"))  # True
print(is_valid_email("invalid-email"))  # False

# Extract data from text
log_line = "192.168.1.1 - - [27/Oct/2023:10:45:29 +0000] \"GET /index.html HTTP/1.1\" 200 1234"

# Extract IP, date, method, URL, and status code
pattern = r'([\d\.]+).*\[([^\]]+)\].*"([A-Z]+) ([^ ]+).*" (\d+)'
match = re.search(pattern, log_line)
if match:
    ip = match.group(1)
    date = match.group(2)
    method = match.group(3)
    url = match.group(4)
    status = match.group(5)
    print(f"IP: {ip}")
    print(f"Date: {date}")
    print(f"Method: {method}")
    print(f"URL: {url}")
    print(f"Status: {status}")
Regex Flags
Regex flags modify how patterns match. In Python, they can be passed through the optional flags argument of the regex functions (for example re.search(pattern, string, flags)) or written inline as part of the pattern.
Flag | Inline | Description |
---|---|---|
re.I or re.IGNORECASE | (?i) | Case-insensitive matching |
re.M or re.MULTILINE | (?m) | ^ and $ match start/end of each line |
re.S or re.DOTALL | (?s) | Dot (.) matches newline character |
re.X or re.VERBOSE | (?x) | Allows whitespace and comments in pattern |
re.A or re.ASCII | (?a) | Makes \w, \W, \b, \B, \d, \D, \s, \S match ASCII only |
re.U or re.UNICODE | (?u) | Makes pattern Unicode aware (default in Python 3) |
import re

text = """Python is awesome.
PYTHON is powerful.
python is fast."""

# Case-insensitive flag
matches = re.findall(r"python", text, re.IGNORECASE)
print(matches)  # ['Python', 'PYTHON', 'python']

# Using inline flag (?i)
matches = re.findall(r"(?i)python", text)
print(matches)  # ['Python', 'PYTHON', 'python']

# Multiline flag
# ^ matches beginning of each line
multiline_text = """First line
Second line
Third line"""

# Without multiline flag - only matches start of string
matches = re.findall(r"^.*line", multiline_text)
print(matches)  # ['First line']

# With multiline flag - matches start of each line
matches = re.findall(r"^.*line", multiline_text, re.MULTILINE)
print(matches)  # ['First line', 'Second line', 'Third line']

# Verbose flag for readable patterns: whitespace in the pattern is ignored and
# "#" starts a comment, so each piece must sit on its own line.
phone_pattern = re.compile(r"""
    \(?\d{3}\)?   # Area code (optional parentheses)
    [-.\s]?       # Optional separator
    \d{3}         # First 3 digits
    [-.\s]?       # Optional separator
    \d{4}         # Last 4 digits
""", re.VERBOSE)

print(phone_pattern.match("(123) 456-7890"))  # Match object
print(phone_pattern.match("123.456.7890"))  # Match object
print(phone_pattern.match("1234567890"))  # Match object
Best Practices & Performance Tips
- Compile Patterns: If you use the same pattern multiple times, compile it with `re.compile()` for better performance.
- Be Specific: Make patterns as specific as possible to avoid unnecessary backtracking, which can cause performance issues.
- Use Raw Strings: Always use raw strings (r"...") for regex patterns to avoid issues with backslashes.
- Test Thoroughly: Test your patterns against various inputs, including edge cases.
- Use Non-Capturing Groups: When you don't need to extract grouped content, use non-capturing groups `(?:...)` for better performance.
- Avoid Catastrophic Backtracking: Be cautious with nested repetition quantifiers (`*`, `+`, `{m,n}`), which can cause exponential time complexity.
- Readability: Use the verbose flag (`re.VERBOSE`) for complex patterns to improve readability and maintainability.
Common Pitfalls
import re
import time

# Pitfall 1: Catastrophic backtracking
# Nested quantifiers only blow up when the overall match FAILS: the trailing
# "$" can never match in front of the "b", so the engine tries every way of
# splitting the run of a's between the inner and the outer "+".
bad_pattern = r"(a+)+$"
text = "a" * 30 + "b"  # 30 'a's followed by 'b'

# This will take exponential time, so it is deliberately left disabled:
# start = time.perf_counter()
# re.match(bad_pattern, text)
# print(f"Time: {time.perf_counter() - start} seconds")  # Could take very long

# Better pattern for matching repeated 'a's
good_pattern = r"a+b"
start = time.perf_counter()
re.match(good_pattern, text)
print(f"Time: {time.perf_counter() - start} seconds")  # Very fast

# Pitfall 2: Greedy vs non-greedy quantifiers
html = "<div>First div</div><div>Second div</div>"

# Greedy - matches as much as possible
greedy = re.search(r"<div>.*</div>", html)
print(greedy.group())  # "<div>First div</div><div>Second div</div>"

# Non-greedy - matches as little as possible
non_greedy = re.search(r"<div>.*?</div>", html)
print(non_greedy.group())  # "<div>First div</div>"

# Pitfall 3: Not escaping special characters
# An unescaped "." matches ANY character, not just a literal dot
print(re.search(r"domain.com", "domain.com"))  # Works but for wrong reasons
print(re.search(r"domain.com", "domainXcom"))  # Still matches!

# Correct way - escape the dot
print(re.search(r"domain\.com", "domain.com"))  # Matches
print(re.search(r"domain\.com", "domainXcom"))  # No match
Practice Exercises
Try solving these regex challenges to strengthen your understanding:
- Write a regex to validate email addresses (covering common formats).
- Create a pattern to extract all the hashtags from a social media post.
- Write a pattern to validate a date in the format YYYY-MM-DD.
- Create a regex to extract all URLs from a text.
- Write a pattern to mask credit card numbers, showing only the last 4 digits.
Example Solution (Extracting Hashtags)
import re

text = "I love #Python and #RegularExpressions #programming #coding"

# Extract all hashtags: a "#" followed by one or more word characters
hashtags = re.findall(r"#\w+", text)
print(hashtags)  # ['#Python', '#RegularExpressions', '#programming', '#coding']

# If you want the text without the # character
hashtag_text = [tag[1:] for tag in hashtags]
print(hashtag_text)  # ['Python', 'RegularExpressions', 'programming', 'coding']