Regular Expressions in Python

Regular expressions (regex) are powerful patterns used to match character combinations in strings. Python's re module provides full support for Perl-like regular expressions, enabling you to search, extract, and manipulate text with complex pattern matching.

Regex Fundamentals

Basic Patterns

Regular expressions consist of ordinary characters and special metacharacters that define search patterns.

Character Meaning Example
. Any character except newline a.b matches "acb", "adb", etc.
^ Start of string ^hello matches "hello world"
$ End of string world$ matches "hello world"
* 0 or more repetitions ab*c matches "ac", "abc", "abbc"
+ 1 or more repetitions ab+c matches "abc", "abbc"
? 0 or 1 repetition ab?c matches "ac", "abc"
{m,n} m to n repetitions a{2,4} matches "aa", "aaa", "aaaa"
[] Character set [abc] matches "a", "b", or "c"
[^] Negated character set [^abc] matches any char except a, b, c
| Alternation (OR) a|b matches "a" or "b"
() Grouping (abc)+ matches "abc", "abcabc"
import re

# Basic matching: ordinary characters in a pattern match themselves literally
pattern = r"python"
text = "I love python programming"
match = re.search(pattern, text)
print(match.group())  # Outputs: python

# Using metacharacters
# NOTE: * is greedy, so ".*" extends to the LAST "n" in the text, not to the
# "n" that ends the word "python".
pattern = r"py.*n"
text = "python is fun"
match = re.search(pattern, text)
print(match.group())  # Outputs: python is fun

# Character classes: findall() returns every character that is in the set
pattern = r"[aeiou]"
text = "python"
matches = re.findall(pattern, text)
print(matches)  # Outputs: ['o']

# Anchors: ^ only matches at the very start of the string
pattern = r"^py"
text = "python starts with py"
match = re.search(pattern, text)
print(match.group())  # Outputs: py

Special Character Sequences

Python's re module supports special character sequences for common pattern matching tasks.

Sequence Meaning
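\d Any digit (equivalent to [0-9] with re.ASCII; for str patterns it also matches other Unicode decimal digits by default)
\D Any non-digit (equivalent to [^0-9] with re.ASCII)
\w Any word character (equivalent to [a-zA-Z0-9_] with re.ASCII; for str patterns it also matches Unicode letters and digits by default)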
\W Any non-word character
\s Any whitespace character (space, tab, newline, etc.)
\S Any non-whitespace character
\b Word boundary
\B Non-word boundary
import re

# \d+ picks out each run of one or more digits
text = "I have 3 apples and 2 oranges"
pattern = r"\d+"
matches = re.compile(pattern).findall(text)
print(matches)  # Outputs: ['3', '2']

# \w{5,} keeps only runs of at least five word characters
text = "Python is an amazing programming language"
pattern = r"\w{5,}"
matches = re.compile(pattern).findall(text)
print(matches)  # Outputs: ['Python', 'amazing', 'programming', 'language']

# \b anchors the match to word edges, so only words that START with 'py' count
text = "python and pygame are both python-based"
pattern = r"\bpy\w*\b"
matches = re.compile(pattern, re.IGNORECASE).findall(text)
print(matches)  # Outputs: ['python', 'pygame', 'python']

Regex Functions in Python

The re module provides several functions to work with regular expressions in Python.

Function Description
re.search() Searches for the first match anywhere in the string
re.match() Searches for a match only at the beginning of the string
re.fullmatch() Searches for a match over the entire string
re.findall() Returns a list of all non-overlapping matches
re.finditer() Returns an iterator of all non-overlapping matches
re.sub() Replaces matches with a string or function result
re.split() Splits the string by the occurrences of the pattern
re.compile() Compiles a regex pattern for reuse
import re

text = "Python was created in 1991 by Guido van Rossum"

# re.search(): scan the whole string and stop at the first hit
match = re.search(r"\d+", text)
print(match[0])  # '1991'

# re.match(): succeeds only if the pattern matches at position 0
match = re.match(r"Python", text)
if match:
    print(match[0])  # 'Python'
else:
    print("No match")

# re.fullmatch(): the pattern has to account for the whole string
match = re.fullmatch(r"Python.*", text)
if match:
    print(match[0])  # Entire string
else:
    print("No match")

# re.findall(): every non-overlapping hit, returned as a list of strings
matches = re.findall(r"\b\w{6}\b", text)  # Words of exactly 6 word characters
print(matches)  # ['Python', 'Rossum']

# re.finditer(): same hits, but as match objects that carry their positions
six_char_words = re.finditer(r"\b\w{6}\b", text)
for match in six_char_words:
    print(f"Match '{match.group()}' at position {match.start()}-{match.end()}")

# re.sub(): swap every hit for the replacement text
result = re.sub(r"\d+", "YEAR", text)
print(result)  # "Python was created in YEAR by Guido van Rossum"

# re.split(): cut the string wherever the pattern matches (runs of whitespace)
parts = re.split(r"\s+", text)
print(parts)  # ['Python', 'was', 'created', 'in', '1991', 'by', 'Guido', 'van', 'Rossum']

# re.compile(): build the pattern object once, then call its methods
word_pattern = re.compile(r"\b\w+\b")
words = word_pattern.findall(text)
word_count = len(words)
print(word_count)  # 9

Capturing Groups & Common Patterns

Capturing groups let you extract specific parts of matched text and are defined with parentheses.

Working with Groups

import re

# Basic capturing groups: each (...) saves the text it matched
text = "Python 3.9 was released on October 5, 2020"
match = re.search(r"Python (\d+\.\d+).*?(\d{4})", text)

if match:
    version = match.group(1)  # Access first group
    year = match.group(2)     # Access second group
    full = match.group(0)     # The entire match
    all_groups = match.groups()  # All groups as tuple: ('3.9', '2020')

    print(f"Version: {version}, Year: {year}")
    print(f"Full match: {full}")

# Named groups: (?P<name>...) captures like (...) but is also reachable by name
pattern = r"(?P<language>Python) (?P<version>\d+\.\d+).*?(?P<year>\d{4})"
match = re.search(pattern, text)

if match:
    # Access by name
    print(f"Language: {match.group('language')}")
    print(f"Version: {match.group('version')}")
    print(f"Year: {match.group('year')}")

    # Get all as dictionary
    info = match.groupdict()
    print(info)  # {'language': 'Python', 'version': '3.9', 'year': '2020'}

# Non-capturing groups (?:...)
# Useful for grouping without capturing: the group takes part in matching but
# does not appear in groups()
pattern = r"Python (\d+\.\d+) (?:was released on) (.*)"
match = re.search(pattern, text)
if match:
    print(match.groups())  # ('3.9', 'October 5, 2020')

Common Regex Patterns

Pattern Matches Example
r"^\w+@\w+\.\w+$" Basic email user@example.com
r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$" IPv4 address (format only; octets are not range-checked, so 999.999.999.999 also matches) 192.168.1.1
r"^https?://[\w\-\.]+\.\w+(/\S*)?$" URL https://example.com/path
r"^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$" US Phone number (123) 456-7890
r"^[a-zA-Z0-9]{8,12}$" Username (8-12 alphanumeric characters) User12345
r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)[a-zA-Z\d]{8,}$" Strong password Password123
r"^\d{4}-\d{2}-\d{2}$" Date (YYYY-MM-DD) 2023-12-31
import re

# Validate email
def is_valid_email(email: str) -> bool:
    """Return True if *email* has the shape ``local@domain.tld``.

    This is a simple shape check: word characters, dots and hyphens on both
    sides of the "@", followed by a dot and a word-character suffix. It is
    more permissive than the basic pattern in the table above, and it is not
    a full RFC-compliant validator.
    """
    pattern = r"[\w.-]+@[\w.-]+\.\w+"
    # fullmatch() must consume the whole string. match() with a trailing "$"
    # would also accept an address that is followed by a newline.
    return re.fullmatch(pattern, email) is not None


print(is_valid_email("user@example.com"))  # True
print(is_valid_email("invalid-email"))     # False

# Pull structured fields out of one access-log line
log_line = '192.168.1.1 - - [27/Oct/2023:10:45:29 +0000] "GET /index.html HTTP/1.1" 200 1234'

# Five capture groups, in order: IP, bracketed date, HTTP method, URL, status
pattern = r'([\d\.]+).*\[([^\]]+)\].*"([A-Z]+) ([^ ]+).*" (\d+)'
match = re.search(pattern, log_line)

if match:
    # groups() returns the captures in pattern order, so unpack them in one go
    ip, date, method, url, status = match.groups()

    print(f"IP: {ip}")
    print(f"Date: {date}")
    print(f"Method: {method}")
    print(f"URL: {url}")
    print(f"Status: {status}")

Regex Flags

Regex flags modify how patterns match. In Python, they can be passed through the optional flags argument of the regex functions (for example re.search(pattern, text, re.I) or re.compile(pattern, re.I)) or written inline as part of the pattern.

Flag Inline Description
re.I or re.IGNORECASE (?i) Case-insensitive matching
re.M or re.MULTILINE (?m) ^ and $ match start/end of each line
re.S or re.DOTALL (?s) Dot (.) matches newline character
re.X or re.VERBOSE (?x) Allows whitespace and comments in pattern
re.A or re.ASCII (?a) Makes \w, \W, \b, \B, \d, \D, \s, \S match ASCII only
re.U or re.UNICODE (?u) Makes pattern Unicode aware (default in Python 3)
import re

text = (
    "Python is awesome.\n"
    "PYTHON is powerful.\n"
    "python is fast."
)

# Flag passed as an argument (re.I is the short alias of re.IGNORECASE)
matches = re.findall(r"python", text, re.I)
print(matches)  # ['Python', 'PYTHON', 'python']

# The same flag written inline at the start of the pattern
matches = re.findall(r"(?i)python", text)
print(matches)  # ['Python', 'PYTHON', 'python']

# Multiline flag: changes where ^ is allowed to match
multiline_text = "\n".join(["First line", "Second line", "Third line"])

# Default: ^ matches only at the start of the whole string
matches = re.findall(r"^.*line", multiline_text)
print(matches)  # ['First line']

# re.M (alias of re.MULTILINE): ^ also matches right after every newline
matches = re.findall(r"^.*line", multiline_text, re.M)
print(matches)  # ['First line', 'Second line', 'Third line']

# Verbose flag for readable patterns
# re.X is the short alias of re.VERBOSE: whitespace and #-comments inside the
# pattern are ignored, so each piece can be annotated on its own line.
phone_pattern = re.compile(
    r"""
    \(?\d{3}\)?     # Area code (optional parentheses)
    [-.\s]?         # Optional separator
    \d{3}           # First 3 digits
    [-.\s]?         # Optional separator
    \d{4}           # Last 4 digits
    """,
    re.X,
)

# Three spellings of the same number; each one prints a Match object
for candidate in ("(123) 456-7890", "123.456.7890", "1234567890"):
    print(phone_pattern.match(candidate))

Best Practices & Performance Tips

  • Compile Patterns: If you use the same pattern multiple times, compile it with re.compile() for better performance.
  • Be Specific: Make patterns as specific as possible to avoid unnecessary backtracking, which can cause performance issues.
  • Use Raw Strings: Always use raw strings (r"...") for regex patterns to avoid issues with backslashes.
  • Test Thoroughly: Test your patterns against various inputs, including edge cases.
  • Use Non-Capturing Groups: When you don't need to extract grouped content, use non-capturing groups (?:...) for better performance.
  • Avoid Catastrophic Backtracking: Be cautious with nested repetition quantifiers (*, +, {m,n}) which can cause exponential time complexity.
  • Readability: Use the verbose flag (re.VERBOSE) for complex patterns to improve readability and maintainability.

Common Pitfalls

import re
import time

# Pitfall 1: Catastrophic backtracking
# Nested quantifiers only blow up when the overall match FAILS. The trailing $
# cannot match before the final "b", so the engine retries every way of
# splitting the run of 'a's between the inner and the outer +.
# (Without the $, re.match() would simply succeed on the leading 'a's.)
bad_pattern = r"(a+)+$"
text = "a" * 30 + "b"  # 30 'a's followed by 'b'

# Left commented out on purpose: this takes exponential time on the text above
# start = time.perf_counter()
# re.match(bad_pattern, text)
# print(f"Time: {time.perf_counter() - start} seconds")  # Could take very long

# Better pattern for matching repeated 'a's: a single quantifier leaves the
# engine nothing to re-split
good_pattern = r"a+b"
start = time.perf_counter()  # perf_counter() is the clock meant for short timings
re.match(good_pattern, text)
print(f"Time: {time.perf_counter() - start} seconds")  # Very fast

# Pitfall 2: Greedy vs non-greedy quantifiers
html = "<div>First div</div><div>Second div</div>"

# Greedy - .* matches as much as possible, so it runs to the LAST </div>
greedy = re.search(r"<div>.*</div>", html)
print(greedy.group())  # "<div>First div</div><div>Second div</div>"

# Non-greedy - .*? matches as little as possible, so it stops at the FIRST </div>
non_greedy = re.search(r"<div>.*?</div>", html)
print(non_greedy.group())  # "<div>First div</div>"

# Pitfall 3: Not escaping special characters
# An unescaped . is a metacharacter that matches ANY character, not just a dot
print(re.search(r"domain.com", "domain.com"))  # Works but for wrong reasons
print(re.search(r"domain.com", "domainXcom"))  # Still matches!

# Correct way - escape the dot so it only matches a literal "."
print(re.search(r"domain\.com", "domain.com"))  # Matches
print(re.search(r"domain\.com", "domainXcom"))  # No match

Practice Exercises

Try solving these regex challenges to strengthen your understanding:

  1. Write a regex to validate email addresses (covering common formats).
  2. Create a pattern to extract all the hashtags from a social media post.
  3. Write a pattern to validate a date in the format YYYY-MM-DD.
  4. Create a regex to extract all URLs from a text.
  5. Write a pattern to mask credit card numbers, showing only the last 4 digits.

Example Solution (Extracting Hashtags)

import re

text = "I love #Python and #RegularExpressions #programming #coding"

# A hashtag is a '#' followed by one or more word characters
hashtag_finder = re.compile(r"#\w+")
hashtags = hashtag_finder.findall(text)
print(hashtags)  # ['#Python', '#RegularExpressions', '#programming', '#coding']

# Drop the leading '#' from every tag to keep just the label
hashtag_text = [hashtag[1:] for hashtag in hashtags]
print(hashtag_text)  # ['Python', 'RegularExpressions', 'programming', 'coding']