#!/usr/bin/env python3
# File: ./src/scitex/web/_scraping.py
"""Web scraping utilities for extracting URLs.
``bs4`` is an optional third-party dependency (only needed when actually
scraping). Do **not** import it at module load -- doing so leaks the
``ModuleNotFoundError`` through ``scitex.web.__init__`` and through
``scitex.cli.web``, which in turn breaks ``scitex --json`` and
``scitex --help-recursive`` on any install without ``beautifulsoup4``.
See ywatanabe1989/todo#279. The import now lives inside each scraping
function, so merely importing this module is side-effect-free.
"""
import re
import urllib.parse
from typing import List, Optional, Set
import requests
from logging import getLogger
# Module-level logger, named after this module's import path.
logger = getLogger(__name__)
# Passed as the ``timeout`` keyword of every HTTP GET issued by the scraping
# functions in this module (seconds, per the ``requests.get`` convention).
DEFAULT_TIMEOUT = 10
# Sent as the ``User-Agent`` request header on every HTTP GET issued here.
# NOTE(review): presumably browser-like so that sites which reject the
# default client UA still respond -- confirm the intent.
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# --- Hyperlink extraction ---
def get_urls(
    url: str,
    pattern: Optional[str] = None,
    absolute: bool = True,
    same_domain: bool = False,
    include_external: bool = True,
    *,
    http_get=None,
) -> List[str]:
    """
    Extract all URLs from a webpage.

    The page is fetched with a single HTTP GET, parsed with BeautifulSoup's
    ``html.parser`` backend, and the ``href`` of every ``<a>`` tag that has
    one is collected, filtered, de-duplicated and sorted.

    Args:
        url: The URL of the webpage to scrape
        pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files).
            Applied with ``re.search`` to the URL as it will be returned
            (i.e. after absolutization when ``absolute`` is True).
        absolute: If True, convert relative URLs to absolute URLs
        same_domain: If True, only return URLs from the same domain. The
            comparison is made on the link resolved against ``url``, so
            relative links count as same-domain even when ``absolute`` is
            False.
        include_external: If True, include external links (only applies if same_domain=False)
        http_get: Injected HTTP GET callable matching ``requests.get(url, *,
            timeout, headers)``. Defaults to :func:`requests.get`. Tests pass
            a hand-rolled fake; production code never sets this.

    Returns:
        Sorted list of unique URLs found on the page. An empty list is
        returned (and the error is logged) when the request raises
        ``requests.RequestException``, including a non-2xx status surfaced
        by ``raise_for_status()``.

    Example:
        >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$')
        >>> urls = get_urls('https://example.com', same_domain=True)
    """
    from bs4 import BeautifulSoup  # lazy: see module docstring, todo#279

    if http_get is None:
        http_get = requests.get

    logger.info(f"Fetching URLs from: {url}")
    try:
        # Keep the try body to the calls that can actually raise.
        response = http_get(
            url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()
    except requests.RequestException as e:
        # Best-effort API: report the failure and hand back "no links".
        logger.error(f"Failed to fetch URL {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    base_netloc = urllib.parse.urlparse(url).netloc
    urls_found: Set[str] = set()

    for link in soup.find_all("a", href=True):
        href = link["href"]
        if absolute:
            href = urllib.parse.urljoin(url, href)

        if same_domain:
            # Judge the domain on the *resolved* link. A raw relative href
            # has an empty netloc, so comparing it directly would discard
            # every relative (i.e. same-site) link when absolute=False.
            resolved = href if absolute else urllib.parse.urljoin(url, href)
            if urllib.parse.urlparse(resolved).netloc != base_netloc:
                continue
        elif not include_external:
            # Links without a netloc (relative paths, mailto:, fragments)
            # are not "external" and are kept.
            href_netloc = urllib.parse.urlparse(href).netloc
            if href_netloc and href_netloc != base_netloc:
                continue

        if pattern and not re.search(pattern, href):
            continue

        urls_found.add(href)

    result = sorted(urls_found)
    logger.info(f"Found {len(result)} URLs")
    return result
# --- Image URL extraction ---
def get_image_urls(
    url: str,
    pattern: Optional[str] = None,
    same_domain: bool = False,
    *,
    http_get=None,
) -> List[str]:
    """
    Extract all image URLs from a webpage without downloading them.

    The page is fetched with a single HTTP GET, parsed with BeautifulSoup's
    ``html.parser`` backend, and the source of every ``<img>`` tag is
    resolved to an absolute URL, filtered, de-duplicated and sorted.

    Args:
        url: The URL of the webpage to scrape
        pattern: Optional regex pattern to filter image URLs. Applied with
            ``re.search`` to the absolute image URL.
        same_domain: If True, only return images from the same domain
        http_get: Injected HTTP GET callable matching ``requests.get(url, *,
            timeout, headers)``. Defaults to :func:`requests.get`. Tests pass
            a hand-rolled fake; production code never sets this.

    Returns:
        Sorted list of unique absolute image URLs found on the page. An
        empty list is returned (and the error is logged) when the request
        raises ``requests.RequestException``, including a non-2xx status
        surfaced by ``raise_for_status()``.

    Note:
        - SVG files are automatically skipped (vector graphics); the
          ``.svg``/``.svgz`` extension is recognised even when the URL
          carries a query string or fragment
        - Checks both 'src' and 'data-src' attributes for lazy-loaded images
          ('src' wins when both are non-empty)

    Example:
        >>> img_urls = get_image_urls('https://example.com')
        >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$')
    """
    from bs4 import BeautifulSoup  # lazy: see module docstring, todo#279

    if http_get is None:
        http_get = requests.get

    logger.info(f"Fetching image URLs from: {url}")
    try:
        # Keep the try body to the calls that can actually raise.
        response = http_get(
            url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()
    except requests.RequestException as e:
        # Best-effort API: report the failure and hand back "no images".
        logger.error(f"Failed to fetch URL {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    base_netloc = urllib.parse.urlparse(url).netloc
    svg_suffixes = (".svg", ".svgz")
    image_urls: Set[str] = set()

    for img in soup.find_all("img"):
        img_url = img.get("src") or img.get("data-src")
        if not img_url:
            continue

        img_url = urllib.parse.urljoin(url, img_url)
        parsed_img = urllib.parse.urlparse(img_url)

        # Test the path component so "logo.svg?v=3" and "sprite.svg#icon" are
        # recognised as SVG. The whole-URL test is kept as well so that every
        # URL skipped before this fix is still skipped.
        if parsed_img.path.lower().endswith(svg_suffixes) or img_url.lower().endswith(
            svg_suffixes
        ):
            continue

        if same_domain and parsed_img.netloc != base_netloc:
            continue

        if pattern and not re.search(pattern, img_url):
            continue

        image_urls.add(img_url)

    result = sorted(image_urls)
    logger.info(f"Found {len(result)} image URLs")
    return result