Source code for scitex_web._scraping

#!/usr/bin/env python3
# File: ./src/scitex/web/_scraping.py

"""Web scraping utilities for extracting URLs.

``bs4`` is an optional third-party dependency (only needed when actually
scraping). Do **not** import it at module load -- doing so leaks the
``ModuleNotFoundError`` through ``scitex.web.__init__`` and through
``scitex.cli.web``, which in turn breaks ``scitex --json`` and
``scitex --help-recursive`` on any install without ``beautifulsoup4``.
See ywatanabe1989/todo#279. The import now lives inside each scraping
function, so merely importing this module is side-effect-free.
"""

import re
import urllib.parse
from typing import List, Optional, Set

import requests

from logging import getLogger

# Module-level logger, named after this module via ``__name__``.
logger = getLogger(__name__)

# Passed as ``timeout=`` to the HTTP GET call made by ``get_urls`` and
# ``get_image_urls`` (interpreted as seconds by ``requests.get``).
DEFAULT_TIMEOUT = 10
# Sent as the ``User-Agent`` request header by ``get_urls`` and
# ``get_image_urls``.
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"



[docs]
def get_urls(
    url: str,
    pattern: Optional[str] = None,
    absolute: bool = True,
    same_domain: bool = False,
    include_external: bool = True,
    *,
    http_get=None,
) -> List[str]:
    """
    Extract all URLs from a webpage.

    The page is fetched once, parsed with BeautifulSoup's ``html.parser``,
    and every ``<a>`` tag carrying an ``href`` attribute is considered.

    Args:
        url: The URL of the webpage to scrape
        pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files).
            Applied with ``re.search`` to the URL as it will be returned
            (i.e. after absolutisation when ``absolute=True``).
        absolute: If True, convert relative URLs to absolute URLs
        same_domain: If True, only return URLs from the same domain. The
            comparison is made on the link *resolved against* ``url``, so
            relative links count as same-domain even when ``absolute=False``.
        include_external: If True, include external links (only applies if same_domain=False)
        http_get: Injected HTTP GET callable matching ``requests.get(url, *,
            timeout, headers)``. Defaults to :func:`requests.get`. Tests pass
            a hand-rolled fake; production code never sets this.

    Returns:
        Sorted, de-duplicated list of URLs found on the page. An empty list
        is returned (and the error is logged) when the fetch raises
        ``requests.RequestException``, including HTTP error statuses raised
        by ``raise_for_status()``.

    Raises:
        ImportError: If ``beautifulsoup4`` is not installed (it is imported
            lazily inside this function).
        re.error: If ``pattern`` is not a valid regular expression and at
            least one link reaches the pattern filter.

    Example:
        >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$')
        >>> urls = get_urls('https://example.com', same_domain=True)
    """
    from bs4 import BeautifulSoup  # lazy: see module docstring, todo#279

    if http_get is None:
        http_get = requests.get

    try:
        logger.info(f"Fetching URLs from: {url}")
        response = http_get(
            url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch URL {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    urls_found: Set[str] = set()

    base_netloc = urllib.parse.urlparse(url).netloc

    for link in soup.find_all("a", href=True):
        href = link["href"]

        if absolute:
            href = urllib.parse.urljoin(url, href)

        if same_domain:
            # Judge the domain on the resolved link. A relative href has an
            # empty netloc, so comparing it unresolved would wrongly discard
            # every relative (and therefore same-domain) link whenever
            # absolute=False.
            resolved = href if absolute else urllib.parse.urljoin(url, href)
            if urllib.parse.urlparse(resolved).netloc != base_netloc:
                continue
        elif not include_external:
            # Links without a netloc (relative paths, fragments, mailto:, ...)
            # are not treated as external and are kept.
            href_netloc = urllib.parse.urlparse(href).netloc
            if href_netloc and href_netloc != base_netloc:
                continue

        if pattern and not re.search(pattern, href):
            continue

        urls_found.add(href)

    result = sorted(urls_found)
    logger.info(f"Found {len(result)} URLs")
    return result




[docs]
def get_image_urls(
    url: str,
    pattern: Optional[str] = None,
    same_domain: bool = False,
    *,
    http_get=None,
) -> List[str]:
    """
    Extract all image URLs from a webpage without downloading them.

    The page is fetched once, parsed with BeautifulSoup's ``html.parser``,
    and every ``<img>`` tag is considered. Image URLs are always resolved
    to absolute form against ``url``.

    Args:
        url: The URL of the webpage to scrape
        pattern: Optional regex pattern to filter image URLs. Applied with
            ``re.search`` to the absolute image URL.
        same_domain: If True, only return images from the same domain
        http_get: Injected HTTP GET callable matching ``requests.get(url, *,
            timeout, headers)``. Defaults to :func:`requests.get`. Tests pass
            a hand-rolled fake; production code never sets this.

    Returns:
        Sorted, de-duplicated list of image URLs found on the page. An empty
        list is returned (and the error is logged) when the fetch raises
        ``requests.RequestException``, including HTTP error statuses raised
        by ``raise_for_status()``.

    Raises:
        ImportError: If ``beautifulsoup4`` is not installed (it is imported
            lazily inside this function).
        re.error: If ``pattern`` is not a valid regular expression and at
            least one image reaches the pattern filter.

    Note:
        - SVG files are automatically skipped (vector graphics). The check
          is made on the URL *path*, so ``logo.svg?v=2`` is skipped while
          ``photo.png?fallback=x.svg`` is kept.
        - Checks both 'src' and 'data-src' attributes for lazy-loaded images
          ('src' wins when both are present and non-empty).

    Example:
        >>> img_urls = get_image_urls('https://example.com')
        >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$')
    """
    from bs4 import BeautifulSoup  # lazy: see module docstring, todo#279

    if http_get is None:
        http_get = requests.get

    try:
        logger.info(f"Fetching image URLs from: {url}")
        response = http_get(
            url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch URL {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    image_urls: Set[str] = set()

    base_netloc = urllib.parse.urlparse(url).netloc

    for img in soup.find_all("img"):
        img_url = img.get("src") or img.get("data-src")
        if not img_url:
            continue

        img_url = urllib.parse.urljoin(url, img_url)
        parsed_img = urllib.parse.urlparse(img_url)

        # Test the extension on the path only: a query string or fragment
        # after ".svg" must not hide an SVG, and a ".svg" appearing only in
        # the query string must not cause a raster image to be dropped.
        if parsed_img.path.lower().endswith((".svg", ".svgz")):
            continue

        if same_domain and parsed_img.netloc != base_netloc:
            continue

        if pattern and not re.search(pattern, img_url):
            continue

        image_urls.add(img_url)

    result = sorted(image_urls)
    logger.info(f"Found {len(result)} image URLs")
    return result