Web Scraping with Python: BeautifulSoup and Requests

Learn web scraping for data collection - requests, BeautifulSoup, and techniques for bypassing protection.

What is Web Scraping?

Web scraping is the technique of automatically collecting data from websites. It is especially useful for:

  • Reconnaissance: gathering information about a target
  • Data collection: retrieving publicly available data
  • Automation: automating repetitive tasks

Installing the Libraries

pip install requests beautifulsoup4 lxml

HTTP Requests with requests

import requests

# GET request
response = requests.get("https://httpbin.org/get")
print(response.status_code)  # 200
print(response.text)         # HTML/JSON content
print(response.headers)      # Response headers

# POST request
data = {"username": "admin", "password": "secret"}
response = requests.post("https://httpbin.org/post", data=data)

# JSON response
response = requests.get("https://api.github.com/users/octocat")
user_data = response.json()
print(user_data["name"])

# Custom headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "text/html",
}
response = requests.get("https://example.com", headers=headers)

Parsing HTML with BeautifulSoup

from bs4 import BeautifulSoup
import requests

# Fetch and parse
url = "https://example.com"
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")

# Find elements
title = soup.title.text
print(f"Title: {title}")

# Find by tag
all_links = soup.find_all("a")
for link in all_links:
    href = link.get("href")
    text = link.text
    print(f"{text}: {href}")

# Find by class
items = soup.find_all("div", class_="item")

# Find by id
header = soup.find(id="main-header")

# CSS selectors
elements = soup.select("div.container > p.intro")
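
One common pitfall: find() returns None when nothing matches, so calling .text on the result raises AttributeError. A small self-contained sketch of the defensive pattern:

from bs4 import BeautifulSoup

html = "<div><span class='price'>9.99</span></div>"
soup = BeautifulSoup(html, "lxml")

# Guard against missing elements before reading their text
price_tag = soup.find("span", class_="price")
price = price_tag.get_text(strip=True) if price_tag else None

discount_tag = soup.find("span", class_="discount")  # not in the HTML -> None
print(price, discount_tag)  # 9.99 None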

Extracting Data

from bs4 import BeautifulSoup
import requests

def scrape_product_info(url: str) -> dict:
    """Scrape product information from page."""
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    
    return {
        "title": soup.find("h1", class_="product-title").text.strip(),
        "price": soup.find("span", class_="price").text.strip(),
        "description": soup.find("div", class_="description").text.strip(),
        "images": [img["src"] for img in soup.find_all("img", class_="product-image")]
    }

# Extract all links
def extract_links(url: str) -> list:
    """Extract all links from a page."""
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    
    links = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("http"):
            links.append(href)
    
    return links
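
Note that extract_links() keeps only absolute URLs. If relative links matter as well, urljoin from the standard library can resolve them against the page URL; a sketch (the name extract_links_absolute is just for illustration):

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def extract_links_absolute(url: str) -> list:
    """Extract all links, resolving relative hrefs against the page URL."""
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    
    # urljoin turns "/about" into "https://example.com/about"
    return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]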

Sessions and Cookies

import requests

# Use a session to maintain cookies
session = requests.Session()

# Login
login_data = {
    "username": "testuser",
    "password": "testpass"
}
session.post("https://example.com/login", data=login_data)

# Subsequent requests will carry the session's cookies
profile = session.get("https://example.com/profile")
print(profile.text)

# Add cookies manually
session.cookies.set("session_id", "abc123")
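
A session can also carry default headers, so they do not have to be repeated on every call; a short sketch:

import requests

session = requests.Session()

# Headers set on the session are sent with every request made through it
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "text/html",
})

response = session.get("https://example.com")
print(response.request.headers["User-Agent"])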

Handling Pagination

import requests
from bs4 import BeautifulSoup

def scrape_all_pages(base_url: str, max_pages: int = 10) -> list:
    """Scrape multiple pages."""
    
    all_items = []
    
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        response = requests.get(url)
        
        if response.status_code != 200:
            break
        
        soup = BeautifulSoup(response.text, "lxml")
        items = soup.find_all("div", class_="item")
        
        if not items:
            break
        
        for item in items:
            all_items.append({
                "title": item.find("h2").text,
                "link": item.find("a")["href"]
            })
        
        print(f"Scraped page {page}, found {len(items)} items")
    
    return all_items
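
Not every site exposes a ?page= parameter; many simply link to the next page. A sketch that follows such a link instead (the a.next selector is an assumption about the target site's markup):

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def scrape_by_next_link(start_url: str, max_pages: int = 10) -> list:
    """Scrape pages by following the 'next page' link."""
    
    all_items = []
    url = start_url
    
    for _ in range(max_pages):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "lxml")
        
        for item in soup.find_all("div", class_="item"):
            all_items.append(item.get_text(strip=True))
        
        next_link = soup.select_one("a.next")  # assumed markup for the next button
        if not next_link:
            break
        url = urljoin(url, next_link["href"])
    
    return all_items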

Bypassing Protection

import requests
import time
import random

# Rotate User-Agents
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

def get_with_random_ua(url: str) -> requests.Response:
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers)

# Rate limiting
def scrape_with_delay(urls: list, delay: float = 1.0) -> list:
    results = []
    
    for url in urls:
        response = get_with_random_ua(url)
        results.append(response.text)
        time.sleep(delay)  # Be respectful!
    
    return results

# Retry on failure
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def get_session_with_retry() -> requests.Session:
    session = requests.Session()
    
    retry = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[500, 502, 503, 504]
    )
    
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    return session
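
Putting these pieces together, a typical call through the retry-enabled session could look like this (reusing the helpers defined above):

session = get_session_with_retry()
response = session.get(
    "https://example.com",
    headers={"User-Agent": random.choice(USER_AGENTS)},
    timeout=10,
)
print(response.status_code)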

Application: Subdomain Finder

import requests
from concurrent.futures import ThreadPoolExecutor
from typing import List

def check_subdomain(domain: str, subdomain: str) -> str | None:
    """Check if subdomain exists."""
    
    url = f"http://{subdomain}.{domain}"
    
    try:
        response = requests.get(url, timeout=3)
        if response.status_code < 400:
            return subdomain
    except requests.RequestException:
        pass
    
    return None

def find_subdomains(domain: str, wordlist: List[str], threads: int = 20) -> List[str]:
    """Find existing subdomains using wordlist."""
    
    found = []
    
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [
            executor.submit(check_subdomain, domain, sub) 
            for sub in wordlist
        ]
        
        for future in futures:
            result = future.result()
            if result:
                found.append(result)
                print(f"[+] Found: {result}.{domain}")
    
    return found

# Usage
if __name__ == "__main__":
    wordlist = ["www", "mail", "ftp", "admin", "api", "dev", "test"]
    subdomains = find_subdomains("example.com", wordlist)
    print(f"\nFound {len(subdomains)} subdomains")

Application: Email Harvester

import re
import requests
from bs4 import BeautifulSoup
from typing import Set

def extract_emails(text: str) -> Set[str]:
    """Extract emails using regex."""
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return set(re.findall(pattern, text))

def harvest_emails(url: str) -> Set[str]:
    """Harvest emails from a webpage."""
    
    try:
        response = requests.get(url, timeout=10)
        emails = extract_emails(response.text)
        return emails
    except requests.RequestException:
        return set()

# Usage
emails = harvest_emails("https://example.com/contact")
print(f"Found emails: {emails}")

Best Practices

  1. Respect robots.txt: check it before scraping (see the sketch after this list)
  2. Rate limiting: do not flood the target with requests
  3. User-Agent: use a realistic user agent string
  4. Error handling: handle failures gracefully
  5. Legal considerations: only scrape data you are permitted to collect
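
For point 1, the Python standard library already ships a robots.txt parser; a minimal sketch of the check:

from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def allowed_to_scrape(url: str, user_agent: str = "*") -> bool:
    """Check the site's robots.txt before scraping a URL."""
    
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    
    return rp.can_fetch(user_agent, url)

print(allowed_to_scrape("https://example.com/some-page"))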

Next Steps

Coming up next in the course:

  • Automation Scripts: automating repetitive tasks
  • Cryptography basics: hashing, encryption

⚠️ Legal Notice: Only scrape websites you are authorized to access. Always check the Terms of Service!