Source code for scitex_web.download_images

#!/usr/bin/env python3
# File: ./src/scitex/web/download_images.py

"""
Image Downloader for SciTeX.

Downloads images from URLs with minimum size filtering.

Usage:
    python -m scitex.web.download_images https://example.com
    python -m scitex.web.download_images https://example.com -o ./downloads
    python -m scitex.web.download_images https://example.com --min-size 800x600
"""

import os
import re
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Tuple

import requests
from tqdm import tqdm

# NOTE: ``bs4`` is imported lazily inside functions that actually use it.
# Importing at module load leaks ``ModuleNotFoundError`` through
# ``scitex.web.__init__`` and breaks ``scitex --json`` /
# ``scitex --help-recursive`` on installs without beautifulsoup4.
# See ywatanabe1989/todo#279.

from io import BytesIO

from scitex_dev import try_import_optional

Image = try_import_optional("PIL.Image")
PILLOW_AVAILABLE = Image is not None

from logging import getLogger

logger = getLogger(__name__)

# Configuration
DEFAULT_MIN_WIDTH = 400
DEFAULT_MIN_HEIGHT = 300
DEFAULT_TIMEOUT = 10
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"


def _get_default_download_dir() -> str:
    """Get default download directory using SCITEX_DIR if available."""
    scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex"))
    return os.path.join(scitex_root, "web", "downloads")


def _normalize_url_for_directory(url: str) -> str:
    """Convert URL to a safe directory name."""
    parsed = urllib.parse.urlparse(url)
    domain = parsed.netloc.replace("www.", "")
    path = parsed.path.strip("/").replace("/", "-")

    normalized = f"{domain}-{path}" if path else domain
    normalized = re.sub(r"[^\w\-.]", "-", normalized)
    normalized = re.sub(r"-+", "-", normalized)
    normalized = normalized[:100].strip("-")

    return normalized


def _is_direct_image_url(url: str) -> bool:
    """Check if URL appears to be a direct image link."""
    extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]
    path = urllib.parse.urlparse(url.lower()).path
    return any(path.endswith(ext) for ext in extensions)


def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]:
    """Extract image URLs from a webpage."""
    from bs4 import BeautifulSoup  # lazy: see module note, todo#279

    try:
        logger.info(f"Fetching page: {url}")
        response = requests.get(
            url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch page: {e}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    parsed_base = urllib.parse.urlparse(url)
    image_urls = set()

    for img in soup.find_all("img"):
        img_url = img.get("src") or img.get("data-src")
        if not img_url:
            continue

        img_url = urllib.parse.urljoin(url, img_url)

        if img_url.lower().endswith((".svg", ".svgz")):
            continue

        if same_domain:
            parsed_img = urllib.parse.urlparse(img_url)
            if parsed_img.netloc != parsed_base.netloc:
                continue

        image_urls.add(img_url)

    logger.info(f"Found {len(image_urls)} images on page")
    return list(image_urls)


def _download_single_image(
    img_url: str,
    output_dir: Path,
    counter: int,
    min_size: Optional[Tuple[int, int]],
) -> Optional[str]:
    """Download a single image."""
    try:
        response = requests.get(
            img_url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()

        # Validate content-type
        content_type = response.headers.get("content-type", "")
        if not content_type.startswith("image/"):
            logger.debug(f"Skipping non-image: {content_type}")
            return None

        # Check dimensions
        if min_size and PILLOW_AVAILABLE:
            try:
                img = Image.open(BytesIO(response.content))
                width, height = img.size
                if width < min_size[0] or height < min_size[1]:
                    logger.debug(
                        f"Skipping small image: {width}x{height} "
                        f"(min: {min_size[0]}x{min_size[1]})"
                    )
                    return None
            except Exception:
                pass

        # Determine extension
        ext = "jpg"
        if PILLOW_AVAILABLE:
            try:
                img = Image.open(BytesIO(response.content))
                fmt = img.format.lower() if img.format else "jpeg"
                ext = "jpg" if fmt == "jpeg" else fmt
            except Exception:
                pass
        elif "png" in content_type:
            ext = "png"
        elif "gif" in content_type:
            ext = "gif"
        elif "webp" in content_type:
            ext = "webp"

        filename = f"{counter:04d}.{ext}"
        filepath = output_dir / filename

        with open(filepath, "wb") as f:
            f.write(response.content)

        logger.info(f"Downloaded: {filename}")
        return str(filepath)

    except Exception as e:
        logger.warning(f"Error downloading {img_url}: {e}")
        return None


[docs] def download_images( url: str, output_dir: Optional[str] = None, min_size: Optional[Tuple[int, int]] = None, max_workers: int = 5, same_domain: bool = False, ) -> List[str]: """ Download images from a URL. Args: url: Webpage URL or direct image URL output_dir: Output directory (default: $SCITEX_DIR/web/downloads) min_size: Minimum (width, height) to filter small images (default: 400x300) max_workers: Concurrent download threads same_domain: Only download images from the same domain Returns: List of downloaded file paths Example: >>> paths = download_images("https://example.com") >>> paths = download_images("https://example.com/photo.jpg") >>> paths = download_images("https://example.com", min_size=(800, 600)) """ if not PILLOW_AVAILABLE: logger.warning("Pillow not available. Size filtering disabled.") min_size = None elif min_size is None: min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT) # Setup output directory if output_dir is None: output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR") if output_dir is None: output_dir = _get_default_download_dir() timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") normalized = _normalize_url_for_directory(url) output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images" output_path.mkdir(parents=True, exist_ok=True) logger.info(f"Output directory: {output_path}") # Get image URLs if _is_direct_image_url(url): image_urls = [url] logger.info("Direct image URL detected") else: image_urls = _extract_image_urls(url, same_domain=same_domain) if not image_urls: logger.warning("No images found") return [] # Download concurrently downloaded = [] counter = [1] def download_with_counter(img_url: str) -> Optional[str]: idx = counter[0] counter[0] += 1 return _download_single_image(img_url, output_path, idx, min_size) with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = {executor.submit(download_with_counter, u): u for u in image_urls} for future in tqdm( as_completed(futures), total=len(image_urls), desc="Downloading" ): result = future.result() if result: downloaded.append(result) logger.info(f"Downloaded {len(downloaded)} images to {output_path}") return downloaded
def main(): """CLI entry point.""" import argparse parser = argparse.ArgumentParser( description="Download images from URL", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python -m scitex.web.download_images https://example.com python -m scitex.web.download_images https://example.com -o ./downloads python -m scitex.web.download_images https://example.com --min-size 800x600 python -m scitex.web.download_images https://example.com --no-min-size """, ) parser.add_argument("url", help="URL to download images from") parser.add_argument("-o", "--output", help="Output directory") parser.add_argument( "--min-size", default="400x300", help="Minimum size WIDTHxHEIGHT (default: 400x300)", ) parser.add_argument( "--no-min-size", action="store_true", help="Disable size filtering", ) parser.add_argument( "--same-domain", action="store_true", help="Only download from same domain", ) parser.add_argument( "--workers", type=int, default=5, help="Concurrent downloads (default: 5)", ) args = parser.parse_args() min_size = None if not args.no_min_size and args.min_size: w, h = map(int, args.min_size.split("x")) min_size = (w, h) paths = download_images( args.url, output_dir=args.output, min_size=min_size, max_workers=args.workers, same_domain=args.same_domain, ) print(f"\nDownloaded {len(paths)} images:") for p in paths: print(f" {p}") if __name__ == "__main__": main()