Python Web Scraping

Web scraping is the process of extracting data from websites. Python offers several libraries that make web scraping straightforward, allowing you to collect data for analysis, monitoring, research, or integration with other applications.

Basic Web Scraping with Requests and BeautifulSoup

The most common approach to web scraping in Python uses the Requests library to fetch web pages and BeautifulSoup to parse and extract data from the HTML.

# Install required packages
# pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup

# Fetch a web page
url = "https://example.com"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find elements by tag
    all_paragraphs = soup.find_all('p')
    for p in all_paragraphs:
        print(p.text.strip())
    
    # Find elements by CSS selector
    main_heading = soup.select_one('h1')
    print(f"Main heading: {main_heading.text if main_heading else 'Not found'}")
    
    # Find element by ID
    main_content = soup.find(id="main")
    
    # Find elements by class
    navigation_links = soup.find_all('a', class_='nav-link')
    
    # Find elements with specific attributes
    external_links = soup.find_all('a', attrs={'rel': 'external'})
    
    # Extract attribute values
    for link in soup.find_all('a'):
        href = link.get('href')
        print(f"Link: {href}")
else:
    print(f"Failed to retrieve the page: {response.status_code}")
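
In practice you will often want to send query parameters, identify your client with a User-Agent header, and set a timeout so a slow server cannot hang your script. A minimal sketch of the same request pattern (the URL, parameters, and header value below are placeholders):

import requests

url = "https://example.com/search"                    # placeholder URL
params = {"q": "python", "page": 1}                   # placeholder query parameters
headers = {"User-Agent": "my-scraper/1.0 (contact@example.com)"}  # identify your scraper

# timeout raises an exception instead of waiting forever on a slow server
response = requests.get(url, params=params, headers=headers, timeout=10)
response.raise_for_status()  # raise an HTTPError for 4xx/5xx responses
print(response.url)          # final URL including the encoded query string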

Navigating and Searching with BeautifulSoup

BeautifulSoup offers multiple ways to navigate and search through HTML documents.

from bs4 import BeautifulSoup

html_doc = """
<html>
  <head>
    <title>Sample Page</title>
  </head>
  <body>
    <div id="content">
      <h1 class="title">Welcome to the Page</h1>
      <p class="intro">This is an introduction paragraph.</p>
      <div class="section">
        <h2>Section 1</h2>
        <p>Content for section 1.</p>
        <ul>
          <li><a href="https://example.com/1">Link 1</a></li>
          <li><a href="https://example.com/2">Link 2</a></li>
        </ul>
      </div>
      <div class="section">
        <h2>Section 2</h2>
        <p>Content for section 2.</p>
      </div>
    </div>
    <footer>
      <p>Copyright 2023</p>
    </footer>
  </body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Navigating down the tree
content_div = soup.find(id="content")
sections = content_div.find_all("div", class_="section")
print(f"Found {len(sections)} sections")

# Navigating up the tree
first_link = soup.find('a')
parent_li = first_link.parent
parent_ul = parent_li.parent
print(f"Parent of link: {parent_li.name}")
print(f"Parent of list item: {parent_ul.name}")

# Navigating sideways
first_section = soup.find("div", class_="section")
next_section = first_section.find_next_sibling("div", class_="section")
print(f"Next section heading: {next_section.h2.text}")

# Complex navigation
# Find all links in the first section
links_in_first_section = first_section.find_all('a')
print("Links in first section:")
for link in links_in_first_section:
    print(f"- {link.text}: {link['href']}")

# Using CSS selectors
# Find all paragraphs inside section divs
section_paragraphs = soup.select("div.section p")
print("Section paragraphs:")
for p in section_paragraphs:
    print(f"- {p.text}")
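
Beyond find(), find_all(), and CSS selectors, BeautifulSoup also exposes iterators and text helpers that are frequently useful. A short sketch that continues with the same soup object from the example above:

from bs4 import Tag

# Iterate over the direct children of a tag (text nodes are skipped)
content_div = soup.find(id="content")
for child in content_div.children:
    if isinstance(child, Tag):
        print(f"Child tag: {child.name}")

# Collect the names of every tag nested anywhere inside <body>
tag_names = {tag.name for tag in soup.body.descendants if isinstance(tag, Tag)}
print(f"Tags used in the body: {sorted(tag_names)}")

# Get all visible text with a separator, whitespace stripped
print(soup.get_text(separator=" ", strip=True))

# Access the <title> string directly
print(f"Page title: {soup.title.string}")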

More Advanced Techniques

Handling JavaScript

Many modern websites use JavaScript to load content dynamically. For these sites, you may need a browser automation tool such as Selenium or Playwright, which drives a real browser (usually in headless mode) so the JavaScript runs before you extract the HTML.

# pip install selenium webdriver-manager
# webdriver-manager downloads a matching chromedriver automatically

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no browser UI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Initialize the WebDriver
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options
)

# Navigate to a webpage
url = "https://example.com"
driver.get(url)

# Wait for JavaScript to load content
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the element to be present
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "dynamically-loaded-element"))
)

# Find elements by various methods
# By ID
element_by_id = driver.find_element(By.ID, "element-id")

# By class name
elements_by_class = driver.find_elements(By.CLASS_NAME, "element-class")

# By CSS selector
element_by_css = driver.find_element(By.CSS_SELECTOR, "div#main > p.intro")

# By XPath
element_by_xpath = driver.find_element(By.XPATH, "//div[@id='main']/p")

# Extract text and attributes
text = element_by_id.text
attribute = element_by_id.get_attribute("href")

# Interacting with elements
# Click a button
button = driver.find_element(By.ID, "submit-button")
button.click()

# Fill a form
input_field = driver.find_element(By.NAME, "username")
input_field.send_keys("my_username")

# Clean up
driver.quit()
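
Playwright, mentioned above as an alternative to Selenium, bundles its own browser binaries and offers a compact synchronous API. A minimal sketch (the waited-for selector is a placeholder, and the rendered HTML is handed back to BeautifulSoup for familiar parsing):

# pip install playwright
# playwright install chromium   (downloads the browser binary)

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

url = "https://example.com"

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto(url)

    # Wait for a dynamically rendered element (placeholder selector)
    page.wait_for_selector("#dynamically-loaded-element", timeout=10_000)

    # Parse the fully rendered HTML with BeautifulSoup
    soup = BeautifulSoup(page.content(), 'html.parser')
    print(soup.title.string if soup.title else "No title")

    browser.close()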

Data Extraction and Storage

After scraping, you'll typically want to structure and store the data.

import requests
from bs4 import BeautifulSoup
import csv
import json
import pandas as pd

url = "https://example.com/products"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Extract product information
products = []
for product_div in soup.find_all('div', class_='product'):
    product = {
        'name': product_div.find('h3', class_='product-name').text.strip(),
        'price': product_div.find('span', class_='price').text.strip(),
        'rating': float(product_div.find('div', class_='rating').get('data-rating', 0)),
        'url': product_div.find('a', class_='product-link')['href']
    }
    products.append(product)

# Save to CSV
with open('products.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['name', 'price', 'rating', 'url']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for product in products:
        writer.writerow(product)

# Save to JSON
with open('products.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(products, jsonfile, indent=4)

# Using pandas
df = pd.DataFrame(products)
print(df.head())

# Save to Excel (requires the openpyxl package: pip install openpyxl)
df.to_excel('products.xlsx', index=False)

# Save to SQLite database
import sqlite3

conn = sqlite3.connect('products.db')
df.to_sql('products', conn, if_exists='replace', index=False)
conn.close()
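
When the data you want is already laid out in HTML <table> elements, pandas can parse it directly, with no manual BeautifulSoup work. A short sketch (assumes the page actually contains tables and that lxml is installed; the URL is a placeholder):

# pip install pandas lxml
import pandas as pd

# read_html returns a list of DataFrames, one per <table> found on the page
tables = pd.read_html("https://example.com/stats")
print(f"Found {len(tables)} tables")
print(tables[0].head())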

Best Practices and Ethics

Ethical Considerations

  • Always review the website's Terms of Service and robots.txt
  • Respect rate limits and consider adding delays between requests
  • Don't overload servers with excessive requests
  • Consider using official APIs if available (see the sketch after this list)
  • Don't scrape personal information or protected content
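
When a site offers an official API, it is usually faster and more reliable than parsing HTML: the data arrives as structured JSON and the terms of use are explicit. A sketch against a hypothetical JSON endpoint (the URL and field names are invented for illustration):

import requests

# Hypothetical endpoint and parameters, for illustration only
api_url = "https://api.example.com/v1/products"
params = {"category": "books", "page": 1}
headers = {"Accept": "application/json"}

response = requests.get(api_url, params=params, headers=headers, timeout=10)
response.raise_for_status()

data = response.json()  # already structured: no HTML parsing needed
for item in data.get("results", []):
    print(item.get("name"), item.get("price"))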

Technical Best Practices

  • Use proper error handling for resilient scraping
  • Set appropriate timeouts and retries
  • Use a rotating IP or proxy if necessary
  • Implement user agents to identify your scraper
  • Cache responses to reduce requests
  • Implement incremental scraping for large datasets

The example below combines several of these practices: a persistent session, a randomized User-Agent, polite delays, retries with exponential backoff, and a robots.txt check. A separate caching sketch follows at the end of this section.

# pip install fake-useragent
import requests
import time
from bs4 import BeautifulSoup
import random
from fake_useragent import UserAgent

# Create a session for maintaining cookies and headers
session = requests.Session()

# Set a random user agent
ua = UserAgent()
session.headers.update({'User-Agent': ua.random})

# Function to scrape with rate limiting and error handling
def scrape_page(url, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            # Random delay to be polite (1-3 seconds)
            time.sleep(random.uniform(1, 3))
            
            # Make the request
            response = session.get(url, timeout=10)
            
            # Check for common HTTP errors
            response.raise_for_status()
            
            # Return soup object if successful
            return BeautifulSoup(response.text, 'html.parser')
            
        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")
            retries += 1
            # Exponential backoff
            time.sleep(2 ** retries)
    
    # If all retries fail
    print(f"Failed to scrape {url} after {max_retries} attempts")
    return None

# Function to check robots.txt
def is_allowed(url, user_agent="*"):
    import urllib.robotparser
    
    # Parse the URL to get the base domain
    from urllib.parse import urlparse
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    
    # Initialize the RobotFileParser
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(f"{base_url}/robots.txt")
    
    try:
        rp.read()
        return rp.can_fetch(user_agent, url)
    except Exception:
        # If robots.txt can't be read, assume scraping is allowed
        return True

# Example usage
url_to_scrape = "https://example.com/products"

if is_allowed(url_to_scrape, session.headers['User-Agent']):
    soup = scrape_page(url_to_scrape)
    if soup:
        # Process the page
        print("Successfully scraped the page")
else:
    print("Scraping this URL is not allowed by robots.txt")
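
The "cache responses" recommendation above can be covered by the requests-cache package, which transparently stores responses (in a local SQLite file by default) so repeated runs do not hit the server again. A minimal sketch, assuming requests-cache is installed:

# pip install requests-cache
import requests_cache

# CachedSession is a drop-in replacement for requests.Session;
# cached responses are reused until they expire (here, one hour)
session = requests_cache.CachedSession('scrape_cache', expire_after=3600)

response = session.get("https://example.com")
print(f"Served from cache: {response.from_cache}")

# An identical request within the hour is answered from the cache
response = session.get("https://example.com")
print(f"Served from cache: {response.from_cache}")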

Practice Exercises

  1. Build a scraper to extract article titles and summaries from a news website.
  2. Create a price monitoring tool for a product across multiple e-commerce websites.
  3. Develop a script to download and organize images from a gallery website.
  4. Build a scraper to extract job listings from a job board.
  5. Create a weather data collector that scrapes forecast information.