Source code for scitex_web.download_images

#!/usr/bin/env python3
# File: ./src/scitex/web/download_images.py

"""
Image Downloader for SciTeX.

Downloads images from URLs with minimum size filtering.

Usage:
    python -m scitex.web.download_images https://example.com
    python -m scitex.web.download_images https://example.com -o ./downloads
    python -m scitex.web.download_images https://example.com --min-size 800x600
"""

import os
import re
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Tuple

import requests
from tqdm import tqdm

# NOTE: ``bs4`` is imported lazily inside functions that actually use it.
# Importing at module load leaks ``ModuleNotFoundError`` through
# ``scitex.web.__init__`` and breaks ``scitex --json`` /
# ``scitex --help-recursive`` on installs without beautifulsoup4.
# See ywatanabe1989/todo#279.

from io import BytesIO

from scitex_dev import try_import_optional

# Pillow is an optional dependency. ``try_import_optional`` is defined in
# ``scitex_dev``; presumably it returns the imported module on success and
# ``None`` when the package is missing (TODO confirm against scitex_dev).
# PILLOW_AVAILABLE gates the size filtering and format detection below.
Image = try_import_optional("PIL.Image")
PILLOW_AVAILABLE = Image is not None

from logging import getLogger

# Module-level logger, named after this module.
logger = getLogger(__name__)

# Configuration
# Minimum image dimensions used by download_images() when the caller does not
# pass ``min_size`` and Pillow is available.
DEFAULT_MIN_WIDTH = 400
DEFAULT_MIN_HEIGHT = 300
# Passed as ``timeout=`` to every ``requests.get`` call in this module.
DEFAULT_TIMEOUT = 10
# Sent as the ``User-Agent`` header with every request made by this module.
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"


def _get_default_download_dir() -> str:
    """Get default download directory using SCITEX_DIR if available."""
    scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex"))
    return os.path.join(scitex_root, "web", "downloads")


def _normalize_url_for_directory(url: str) -> str:
    """Convert URL to a safe directory name."""
    parsed = urllib.parse.urlparse(url)
    domain = parsed.netloc.replace("www.", "")
    path = parsed.path.strip("/").replace("/", "-")

    normalized = f"{domain}-{path}" if path else domain
    normalized = re.sub(r"[^\w\-.]", "-", normalized)
    normalized = re.sub(r"-+", "-", normalized)
    normalized = normalized[:100].strip("-")

    return normalized


def _is_direct_image_url(url: str) -> bool:
    """Check if URL appears to be a direct image link."""
    extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]
    path = urllib.parse.urlparse(url.lower()).path
    return any(path.endswith(ext) for ext in extensions)


def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]:
    """
    Extract image URLs from a webpage.

    Fetches ``url``, parses it with BeautifulSoup and collects the ``src``
    (or, failing that, ``data-src``) of every ``<img>`` tag, resolved against
    ``url``.

    Args:
        url: Page to fetch
        same_domain: Keep only images whose netloc equals that of ``url``

    Returns:
        De-duplicated absolute http(s) image URLs in document order, with SVG
        images excluded. Empty when the page cannot be fetched.
    """
    from bs4 import BeautifulSoup  # lazy: see module note, todo#279

    try:
        logger.info(f"Fetching page: {url}")
        response = requests.get(
            url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch page: {e}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    base_netloc = urllib.parse.urlparse(url).netloc

    # A dict keeps insertion order, so duplicates are dropped while the
    # document order (and therefore the file numbering) stays deterministic.
    image_urls = {}

    for img in soup.find_all("img"):
        src = img.get("src") or img.get("data-src")
        if not src:
            continue

        try:
            img_url = urllib.parse.urljoin(url, src)
            parsed_img = urllib.parse.urlparse(img_url)
        except ValueError:
            # One malformed attribute must not abort the whole page.
            logger.debug(f"Skipping malformed image URL: {src!r}")
            continue

        # Inline ``data:`` URIs and other non-HTTP schemes cannot be fetched
        # with requests, so there is no point in queueing them.
        if parsed_img.scheme not in ("http", "https"):
            continue

        # Test the path only, so "logo.svg?v=2" is recognised as SVG too.
        if parsed_img.path.lower().endswith((".svg", ".svgz")):
            continue

        if same_domain and parsed_img.netloc != base_netloc:
            continue

        image_urls[img_url] = None

    logger.info(f"Found {len(image_urls)} images on page")
    return list(image_urls)


def _download_single_image(
    img_url: str,
    output_dir: Path,
    counter: int,
    min_size: Optional[Tuple[int, int]],
) -> Optional[str]:
    """
    Download one image and save it as ``<counter:04d>.<ext>`` in ``output_dir``.

    Args:
        img_url: URL of the image to fetch
        output_dir: Directory the file is written into
        counter: Number used for the zero-padded file name
        min_size: Minimum (width, height); smaller images are skipped. Only
            enforced when Pillow is available and can read the image.

    Returns:
        Path of the saved file as a string, or None when the response is not
        an image, the image is too small, or any error occurred (errors are
        logged, not raised).
    """
    try:
        response = requests.get(
            img_url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()

        # Validate content-type (MIME types are case-insensitive)
        content_type = response.headers.get("content-type", "").lower()
        if not content_type.startswith("image/"):
            logger.debug(f"Skipping non-image: {content_type}")
            return None

        content = response.content

        # Inspect the payload once; size and format are both needed below.
        size = None
        fmt = None
        inspected = False
        if PILLOW_AVAILABLE:
            try:
                with Image.open(BytesIO(content)) as img:
                    size = img.size
                    fmt = img.format
                inspected = True
            except Exception as e:
                # Best effort: an unreadable image is still saved, but the
                # failure is no longer silent.
                logger.debug(f"Could not inspect {img_url} with Pillow: {e}")

        # Check dimensions
        if min_size and size is not None:
            width, height = size
            if width < min_size[0] or height < min_size[1]:
                logger.debug(
                    f"Skipping small image: {width}x{height} "
                    f"(min: {min_size[0]}x{min_size[1]})"
                )
                return None

        # Determine extension
        if inspected:
            fmt = fmt.lower() if fmt else "jpeg"
            ext = "jpg" if fmt == "jpeg" else fmt
        else:
            # Pillow is missing or could not read the data: fall back to the
            # Content-Type header instead of blindly assuming JPEG.
            ext = "jpg"
            for token in ("png", "gif", "webp"):
                if token in content_type:
                    ext = token
                    break

        filename = f"{counter:04d}.{ext}"
        filepath = output_dir / filename

        with open(filepath, "wb") as f:
            f.write(content)

        logger.info(f"Downloaded: {filename}")
        return str(filepath)

    except Exception as e:
        logger.warning(f"Error downloading {img_url}: {e}")
        return None



[docs]
def download_images(
    url: str,
    output_dir: Optional[str] = None,
    min_size: Optional[Tuple[int, int]] = None,
    max_workers: int = 5,
    same_domain: bool = False,
) -> List[str]:
    """
    Download images from a URL.

    Files are written to ``<output_dir>/<timestamp>-<normalized-url>-images``.

    Args:
        url: Webpage URL or direct image URL
        output_dir: Output directory (default: $SCITEX_WEB_DOWNLOADS_DIR,
            then $SCITEX_DIR/web/downloads)
        min_size: Minimum (width, height) to filter small images. ``None``
            selects the default 400x300; pass ``(0, 0)`` to keep every image.
            Ignored (with a warning) when Pillow is not available.
        max_workers: Concurrent download threads
        same_domain: Only download images from the same domain

    Returns:
        List of downloaded file paths, in completion order

    Example:
        >>> paths = download_images("https://example.com")
        >>> paths = download_images("https://example.com/photo.jpg")
        >>> paths = download_images("https://example.com", min_size=(800, 600))
    """
    if not PILLOW_AVAILABLE:
        logger.warning("Pillow not available. Size filtering disabled.")
        min_size = None
    elif min_size is None:
        min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT)

    # Setup output directory
    if output_dir is None:
        output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
        if output_dir is None:
            output_dir = _get_default_download_dir()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    normalized = _normalize_url_for_directory(url)
    output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images"
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info(f"Output directory: {output_path}")

    # Get image URLs
    if _is_direct_image_url(url):
        image_urls = [url]
        logger.info("Direct image URL detected")
    else:
        image_urls = _extract_image_urls(url, same_domain=same_domain)

    if not image_urls:
        logger.warning("No images found")
        return []

    # Download concurrently
    downloaded = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # File indices are assigned here, in the submitting thread. Reading
        # and incrementing a shared counter inside the workers is not atomic
        # and could hand two downloads the same file name.
        futures = [
            executor.submit(
                _download_single_image, img_url, output_path, index, min_size
            )
            for index, img_url in enumerate(image_urls, start=1)
        ]

        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Downloading"
        ):
            result = future.result()
            if result:
                downloaded.append(result)

    logger.info(f"Downloaded {len(downloaded)} images to {output_path}")
    return downloaded



def main():
    """
    CLI entry point.

    Parses the command line, calls download_images() and prints the saved
    paths. Exits through ``parser.error`` when ``--min-size`` is malformed.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Download images from URL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m scitex.web.download_images https://example.com
  python -m scitex.web.download_images https://example.com -o ./downloads
  python -m scitex.web.download_images https://example.com --min-size 800x600
  python -m scitex.web.download_images https://example.com --no-min-size
        """,
    )
    parser.add_argument("url", help="URL to download images from")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument(
        "--min-size",
        default="400x300",
        help="Minimum size WIDTHxHEIGHT (default: 400x300)",
    )
    parser.add_argument(
        "--no-min-size",
        action="store_true",
        help="Disable size filtering",
    )
    parser.add_argument(
        "--same-domain",
        action="store_true",
        help="Only download from same domain",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=5,
        help="Concurrent downloads (default: 5)",
    )

    args = parser.parse_args()

    if args.no_min_size:
        # download_images() replaces ``None`` with its 400x300 default, so
        # "no filtering" has to be requested with an explicit zero minimum.
        min_size = (0, 0)
    elif args.min_size:
        try:
            w, h = (int(part) for part in args.min_size.lower().split("x"))
        except ValueError:
            # Covers non-numeric parts as well as a wrong number of parts.
            parser.error(
                f"--min-size must look like WIDTHxHEIGHT (e.g. 800x600), "
                f"got {args.min_size!r}"
            )
        min_size = (w, h)
    else:
        # Empty string given: let download_images() apply its default.
        min_size = None

    paths = download_images(
        args.url,
        output_dir=args.output,
        min_size=min_size,
        max_workers=args.workers,
        same_domain=args.same_domain,
    )

    print(f"\nDownloaded {len(paths)} images:")
    for p in paths:
        print(f"  {p}")

# Run the CLI only when this file is executed as a script (for example via
# ``python -m``), not when it is imported as a module.
if __name__ == "__main__":
    main()