What is Web Scraping?
Web scraping is the technique of automatically collecting data from websites. It is useful for:
- Reconnaissance: gathering information about a target
- Data collection: retrieving publicly available data
- Automation: automating repetitive tasks
Installing the Libraries
pip install requests beautifulsoup4 lxml
HTTP Requests with requests
import requests
# GET request
response = requests.get("https://httpbin.org/get")
print(response.status_code) # 200
print(response.text) # HTML/JSON content
print(response.headers) # Response headers
# POST request
data = {"username": "admin", "password": "secret"}
response = requests.post("https://httpbin.org/post", data=data)
# JSON response
response = requests.get("https://api.github.com/users/octocat")
user_data = response.json()
print(user_data["name"])
# Custom headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
"Accept": "text/html",
}
response = requests.get("https://example.com", headers=headers)
Parsing HTML with BeautifulSoup
from bs4 import BeautifulSoup
import requests
# Fetch and parse
url = "https://example.com"
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
# Find elements
title = soup.title.text
print(f"Title: {title}")
# Find by tag
all_links = soup.find_all("a")
for link in all_links:
    href = link.get("href")
    text = link.text
    print(f"{text}: {href}")
# Find by class
items = soup.find_all("div", class_="item")
# Find by id
header = soup.find(id="main-header")
# CSS selectors
elements = soup.select("div.container > p.intro")
Extracting Data
from bs4 import BeautifulSoup
import requests
def scrape_product_info(url: str) -> dict:
    """Scrape product information from a page."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    return {
        "title": soup.find("h1", class_="product-title").text.strip(),
        "price": soup.find("span", class_="price").text.strip(),
        "description": soup.find("div", class_="description").text.strip(),
        "images": [img["src"] for img in soup.find_all("img", class_="product-image")],
    }
# Extract all links
def extract_links(url: str) -> list:
    """Extract all links from a page."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    links = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("http"):
            links.append(href)
    return links
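A quick usage sketch for extract_links defined above; the URL is a placeholder, and note that scrape_product_info only works if the target page actually uses those class names (otherwise find() returns None and .text raises AttributeError):
# Example usage (placeholder URL)
links = extract_links("https://example.com")
print(f"Found {len(links)} absolute links")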
Sessions and Cookies
import requests
# Use a session to maintain cookies across requests
session = requests.Session()
# Login
login_data = {
"username": "testuser",
"password": "testpass"
}
session.post("https://example.com/login", data=login_data)
# Subsequent requests will carry the session cookies
profile = session.get("https://example.com/profile")
print(profile.text)
# Add cookies manually
session.cookies.set("session_id", "abc123")
Handling Pagination
import requests
from bs4 import BeautifulSoup
def scrape_all_pages(base_url: str, max_pages: int = 10) -> list:
"""Scrape multiple pages."""
all_items = []
for page in range(1, max_pages + 1):
url = f"{base_url}?page={page}"
response = requests.get(url)
if response.status_code != 200:
break
soup = BeautifulSoup(response.text, "lxml")
items = soup.find_all("div", class_="item")
if not items:
break
for item in items:
all_items.append({
"title": item.find("h2").text,
"link": item.find("a")["href"]
})
print(f"Scraped page {page}, found {len(items)} items")
return all_items
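A hedged usage sketch for scrape_all_pages; the URL, the ?page= query parameter, and the div.item markup are assumptions about how the target site is built:
# Example usage (placeholder URL; assumes ?page=N pagination and <div class="item"> entries)
products = scrape_all_pages("https://example.com/products", max_pages=5)
print(f"Total items scraped: {len(products)}")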
Bypassing Protection
import requests
import time
import random
# Rotate User-Agents
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]
def get_with_random_ua(url: str) -> requests.Response:
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers)
# Rate limiting
def scrape_with_delay(urls: list, delay: float = 1.0) -> list:
    results = []
    for url in urls:
        response = get_with_random_ua(url)
        results.append(response.text)
        time.sleep(delay)  # Be respectful!
    return results
# Retry on failure
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
def get_session_with_retry() -> requests.Session:
    session = requests.Session()
    retry = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
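Usage sketch for the retry-enabled session: any request made through it is retried transparently (up to 3 attempts with exponential backoff) when the server answers with one of the listed 5xx codes:
session = get_session_with_retry()
response = session.get("https://httpbin.org/get")  # retried automatically on 500/502/503/504
print(response.status_code)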
Application: Subdomain Finder
import requests
from concurrent.futures import ThreadPoolExecutor
from typing import List
def check_subdomain(domain: str, subdomain: str) -> str | None:
"""Check if subdomain exists."""
url = f"http://{subdomain}.{domain}"
try:
response = requests.get(url, timeout=3)
if response.status_code < 400:
return subdomain
except requests.RequestException:
pass
return None
def find_subdomains(domain: str, wordlist: List[str], threads: int = 20) -> List[str]:
"""Find existing subdomains using wordlist."""
found = []
with ThreadPoolExecutor(max_workers=threads) as executor:
futures = [
executor.submit(check_subdomain, domain, sub)
for sub in wordlist
]
for future in futures:
result = future.result()
if result:
found.append(result)
print(f"[+] Found: {result}.{domain}")
return found
# Usage
if __name__ == "__main__":
wordlist = ["www", "mail", "ftp", "admin", "api", "dev", "test"]
subdomains = find_subdomains("example.com", wordlist)
print(f"\nFound {len(subdomains)} subdomains")
Application: Email Harvester
import re
import requests
from bs4 import BeautifulSoup
from typing import Set
def extract_emails(text: str) -> Set[str]:
"""Extract emails using regex."""
pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
return set(re.findall(pattern, text))
def harvest_emails(url: str) -> Set[str]:
"""Harvest emails from a webpage."""
try:
response = requests.get(url, timeout=10)
emails = extract_emails(response.text)
return emails
except requests.RequestException:
return set()
# Usage
emails = harvest_emails("https://example.com/contact")
print(f"Found emails: {emails}")
Best Practices
- Respect robots.txt: check it before scraping (see the sketch after this list)
- Rate limiting: do not flood the target with requests
- User-Agent: use a realistic user agent string
- Error handling: handle failures gracefully
- Legal considerations: only scrape data you are permitted to collect
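As a sketch of the robots.txt check mentioned above, the standard-library urllib.robotparser can tell you whether a given path may be fetched; the site and user agent string here are placeholders:
from urllib.robotparser import RobotFileParser
# Placeholder target and user agent string
robots = RobotFileParser("https://example.com/robots.txt")
robots.read()
if robots.can_fetch("MyScraperBot", "https://example.com/products"):
    print("robots.txt allows scraping this path")
else:
    print("robots.txt disallows this path")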
Next Steps
Coming up next in the course:
- Automation Scripts: automating tasks
- Basic Cryptography: hashing, encryption
⚠️ Legal Notice: Only scrape websites you are authorized to access. Always check the Terms of Service!