Python Web Scraping
Web scraping is the process of extracting data from websites. Python offers several libraries that make web scraping straightforward, allowing you to collect data for analysis, monitoring, research, or integration with other applications.
Basic Web Scraping with Requests and BeautifulSoup
The most common approach to web scraping in Python uses the Requests library to fetch web pages and BeautifulSoup to parse and extract data from the HTML.
# Install required packages
# pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup

# Fetch a web page
url = "https://example.com"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find elements by tag
    all_paragraphs = soup.find_all('p')
    for p in all_paragraphs:
        print(p.text.strip())

    # Find elements by CSS selector
    main_heading = soup.select_one('h1')
    print(f"Main heading: {main_heading.text if main_heading else 'Not found'}")

    # Find element by ID
    main_content = soup.find(id="main")

    # Find elements by class
    navigation_links = soup.find_all('a', class_='nav-link')

    # Find elements with specific attributes
    external_links = soup.find_all('a', attrs={'rel': 'external'})

    # Extract attribute values
    for link in soup.find_all('a'):
        href = link.get('href')
        print(f"Link: {href}")
else:
    print(f"Failed to retrieve the page: {response.status_code}")
Navigating and Searching with BeautifulSoup
BeautifulSoup offers multiple ways to navigate and search through HTML documents.
from bs4 import BeautifulSoup

html_doc = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div id="content">
        <h1 class="title">Welcome to the Page</h1>
        <p class="intro">This is an introduction paragraph.</p>
        <div class="section">
            <h2>Section 1</h2>
            <p>Content for section 1.</p>
            <ul>
                <li><a href="https://example.com/1">Link 1</a></li>
                <li><a href="https://example.com/2">Link 2</a></li>
            </ul>
        </div>
        <div class="section">
            <h2>Section 2</h2>
            <p>Content for section 2.</p>
        </div>
    </div>
    <footer>
        <p>Copyright 2023</p>
    </footer>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Navigating down the tree
content_div = soup.find(id="content")
sections = content_div.find_all("div", class_="section")
print(f"Found {len(sections)} sections")

# Navigating up the tree
first_link = soup.find('a')
parent_li = first_link.parent
parent_ul = parent_li.parent
print(f"Parent of link: {parent_li.name}")
print(f"Parent of list item: {parent_ul.name}")

# Navigating sideways
first_section = soup.find("div", class_="section")
next_section = first_section.find_next_sibling("div", class_="section")
print(f"Next section heading: {next_section.h2.text}")

# Complex navigation: find all links in the first section
links_in_first_section = first_section.find_all('a')
print("Links in first section:")
for link in links_in_first_section:
    print(f"- {link.text}: {link['href']}")

# Using CSS selectors: find all paragraphs inside section divs
section_paragraphs = soup.select("div.section p")
print("Section paragraphs:")
for p in section_paragraphs:
    print(f"- {p.text}")
More Advanced Techniques
Handling JavaScript
Many modern websites load content dynamically with JavaScript, so a plain HTTP request only returns the initial HTML. For these sites, you may need to drive a real (usually headless) browser with an automation tool such as Selenium or Playwright.
# pip install selenium webdriver-manager
# webdriver-manager downloads a matching chromedriver automatically

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no browser UI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Initialize the WebDriver
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options
)

# Navigate to a webpage
url = "https://example.com"
driver.get(url)

# Wait up to 10 seconds for JavaScript to load the element
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "dynamically-loaded-element"))
)

# Find elements by various methods
# By ID
element_by_id = driver.find_element(By.ID, "element-id")

# By class name
elements_by_class = driver.find_elements(By.CLASS_NAME, "element-class")

# By CSS selector
element_by_css = driver.find_element(By.CSS_SELECTOR, "div#main > p.intro")

# By XPath
element_by_xpath = driver.find_element(By.XPATH, "//div[@id='main']/p")

# Extract text and attributes
text = element_by_id.text
attribute = element_by_id.get_attribute("href")

# Interacting with elements
# Click a button
button = driver.find_element(By.ID, "submit-button")
button.click()

# Fill a form
input_field = driver.find_element(By.NAME, "username")
input_field.send_keys("my_username")

# Clean up
driver.quit()
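If you prefer Playwright, the sketch below shows roughly the same flow using its synchronous API. It assumes the Chromium binaries have been installed with the playwright install command, and the #dynamically-loaded-element selector is a placeholder for whatever your target page renders with JavaScript.

# pip install playwright
# playwright install chromium   (downloads the browser binaries)

from playwright.sync_api import sync_playwright

url = "https://example.com"

with sync_playwright() as p:
    # Launch a headless Chromium browser
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto(url)

    # Wait for a JavaScript-rendered element (placeholder selector)
    page.wait_for_selector("#dynamically-loaded-element", timeout=10_000)

    # Extract text directly, or hand the fully rendered HTML to BeautifulSoup
    heading = page.locator("h1").first.inner_text()
    rendered_html = page.content()
    print(f"Heading: {heading}")

    browser.close()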
Data Extraction and Storage
After scraping, you'll typically want to structure and store the data.
import requests
from bs4 import BeautifulSoup
import csv
import json
import pandas as pd
import sqlite3

url = "https://example.com/products"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Extract product information
# (assumes every product div contains all of the child elements below)
products = []
for product_div in soup.find_all('div', class_='product'):
    product = {
        'name': product_div.find('h3', class_='product-name').text.strip(),
        'price': product_div.find('span', class_='price').text.strip(),
        'rating': float(product_div.find('div', class_='rating').get('data-rating', 0)),
        'url': product_div.find('a', class_='product-link')['href']
    }
    products.append(product)

# Save to CSV
with open('products.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['name', 'price', 'rating', 'url']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for product in products:
        writer.writerow(product)

# Save to JSON
with open('products.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(products, jsonfile, indent=4)

# Using pandas
df = pd.DataFrame(products)
print(df.head())

# Save to Excel (requires the openpyxl package)
df.to_excel('products.xlsx', index=False)

# Save to SQLite database
conn = sqlite3.connect('products.db')
df.to_sql('products', conn, if_exists='replace', index=False)
conn.close()
Best Practices and Ethics
Ethical Considerations
- Always review the website's Terms of Service and robots.txt
- Respect rate limits and consider adding delays between requests
- Don't overload servers with excessive requests
- Prefer an official API when one is available (see the sketch after this list)
- Don't scrape personal information or protected content
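When a site does offer an API, requesting structured data directly is usually simpler and gentler on the server than parsing HTML. The snippet below is a minimal sketch against a hypothetical JSON endpoint; the URL, parameters, and response fields are placeholders for whatever the site's API documentation actually specifies.

import requests

# Hypothetical API endpoint and response shape -- check the site's API docs
api_url = "https://example.com/api/products"
response = requests.get(api_url, params={"page": 1}, timeout=10)
response.raise_for_status()

data = response.json()  # structured data, no HTML parsing required
for item in data.get("products", []):
    print(item.get("name"), item.get("price"))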
Technical Best Practices
- Use proper error handling for resilient scraping
- Set appropriate timeouts and retries
- Use rotating IP addresses or proxies if necessary
- Set a descriptive User-Agent header that identifies your scraper
- Cache responses to avoid re-downloading unchanged pages
- Implement incremental scraping for large datasets (proxies, caching, and incremental scraping are sketched after the example below)
import requests
import time
import random
import urllib.robotparser
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent  # pip install fake-useragent

# Create a session for maintaining cookies and headers
session = requests.Session()

# Set a random user agent
ua = UserAgent()
session.headers.update({'User-Agent': ua.random})

# Function to scrape with rate limiting and error handling
def scrape_page(url, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            # Random delay to be polite (1-3 seconds)
            time.sleep(random.uniform(1, 3))

            # Make the request
            response = session.get(url, timeout=10)

            # Check for common HTTP errors
            response.raise_for_status()

            # Return soup object if successful
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")
            retries += 1
            # Exponential backoff
            time.sleep(2 ** retries)

    # If all retries fail
    print(f"Failed to scrape {url} after {max_retries} attempts")
    return None

# Function to check robots.txt
def is_allowed(url, user_agent="*"):
    # Parse the URL to get the base domain
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # Initialize the RobotFileParser
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(f"{base_url}/robots.txt")

    try:
        rp.read()
        return rp.can_fetch(user_agent, url)
    except Exception:
        # If robots.txt can't be read, assume scraping is allowed
        return True

# Example usage
url_to_scrape = "https://example.com/products"
if is_allowed(url_to_scrape, session.headers['User-Agent']):
    soup = scrape_page(url_to_scrape)
    if soup:
        # Process the page
        print("Successfully scraped the page")
else:
    print("Scraping this URL is not allowed by robots.txt")
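The example above does not cover the last three items in the checklist. The following sketch shows one way to combine a proxy, on-disk response caching via the third-party requests-cache package, and a simple seen-URLs file for incremental scraping; the proxy address and URLs are placeholders.

# pip install requests-cache   (optional third-party caching layer)
import json
import os
import requests
import requests_cache

# Cache GET responses on disk so repeated runs don't re-download unchanged pages
requests_cache.install_cache('scrape_cache', expire_after=3600)  # expire after 1 hour

session = requests.Session()

# Route traffic through a proxy (placeholder address -- substitute your own)
session.proxies.update({
    'http': 'http://user:pass@proxy.example.com:8080',
    'https': 'http://user:pass@proxy.example.com:8080',
})

# Incremental scraping: persist the URLs already processed and skip them next run
SEEN_FILE = 'seen_urls.json'
seen = set(json.load(open(SEEN_FILE))) if os.path.exists(SEEN_FILE) else set()

urls = ["https://example.com/products?page=1", "https://example.com/products?page=2"]
for url in urls:
    if url in seen:
        continue  # already scraped on a previous run
    response = session.get(url, timeout=10)
    response.raise_for_status()
    # ... parse and store the page here ...
    seen.add(url)

with open(SEEN_FILE, 'w', encoding='utf-8') as f:
    json.dump(sorted(seen), f)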
Practice Exercises
- Build a scraper to extract article titles and summaries from a news website.
- Create a price monitoring tool for a product across multiple e-commerce websites.
- Develop a script to download and organize images from a gallery website.
- Build a scraper to extract job listings from a job board.
- Create a weather data collector that scrapes forecast information.