Source code for scitex_web._summarize_url

#!./env/bin/python3
# -*- coding: utf-8 -*-
# Time-stamp: "2024-07-29 21:43:30 (ywatanabe)"
# ./src/scitex/web/_crawl.py


import json
import urllib.parse
from collections import deque
from concurrent.futures import ThreadPoolExecutor, as_completed
from pprint import pprint

import requests
from bs4 import BeautifulSoup
from scitex_dev import try_import_optional
from tqdm import tqdm

# Resolve readability's ``Document`` class by trying each known import
# location in order and keeping the first one that is available.
# Older readability-lxml layouts (pre-0.8) shipped ``Document`` under
# ``readability.readability`` instead of the package root, so that path is
# tried second; callers never need to care which one succeeded.
# ``Document`` stays ``None`` when neither location can be imported.
Document = None
for _readability_module in ("readability", "readability.readability"):
    Document = try_import_optional(
        _readability_module,
        "Document",
        extra="readability",
        pkg="scitex-web",
    )
    if Document is not None:
        break
del _readability_module  # keep the module namespace unchanged

import re

# def crawl_url(url, max_depth=1):
#     print("\nCrawling...")
#     visited = set()
#     to_visit = [(url, 0)]
#     contents = {}

#     while to_visit:
#         current_url, depth = to_visit.pop(0)
#         if current_url in visited or depth > max_depth:
#             continue

#         try:
#             response = requests.get(current_url)
#             if response.status_code == 200:
#                 visited.add(current_url)
#                 contents[current_url] = response.text
#                 soup = BeautifulSoup(response.text, "html.parser")

#                 for link in soup.find_all("a", href=True):
#                     absolute_link = urllib.parse.urljoin(
#                         current_url, link["href"]
#                     )
#                     if absolute_link not in visited:
#                         to_visit.append((absolute_link, depth + 1))

#         except requests.RequestException:
#             pass

#     return visited, contents


# Private sentinel meaning "the caller did not pass ``document_cls``"; needed
# because ``None`` is itself a meaningful value for that argument.
_DOCUMENT_DEFAULT = object()


def extract_main_content(html, *, document_cls=_DOCUMENT_DEFAULT):
    """Return the readable text of an HTML page as one whitespace-normalized string.

    Parameters
    ----------
    html : str
        Raw HTML markup.
    document_cls : class or None, keyword-only
        Extractor class called as ``document_cls(html).summary()``. When
        omitted, the module-level ``Document`` is used. Passing ``None``
        explicitly selects the plain tag-stripping fallback.

    Returns
    -------
    str
        Text with tags removed and whitespace runs collapsed to single
        spaces. Only the fallback path truncates, to the first 5000
        characters.
    """
    extractor = Document if document_cls is _DOCUMENT_DEFAULT else document_cls
    use_fallback = extractor is None

    # With no extractor the whole page is stripped; otherwise only the
    # extractor's summary markup is.
    markup = html if use_fallback else extractor(html).summary()

    # Drop tags, then collapse every whitespace run into a single space.
    words = re.sub("<[^<]+?>", "", markup).split()
    text = " ".join(words)

    return text[:5_000] if use_fallback else text


def crawl_url(url, max_depth=1, *, timeout=30):
    """Breadth-first crawl from ``url``, extracting the text of each page.

    Parameters
    ----------
    url : str
        Absolute URL to start from.
    max_depth : int
        Maximum number of link hops to follow from ``url``. ``0`` fetches
        only the start page; a negative value fetches nothing.
    timeout : float or tuple, keyword-only
        Passed to ``requests.get`` so a stalled server cannot hang the
        crawl indefinitely.

    Returns
    -------
    tuple[set, dict]
        ``(visited, contents)``: the set of URLs that answered with HTTP
        200, and a dict mapping each of those URLs to the text returned by
        ``extract_main_content``.

    Notes
    -----
    The crawl is best-effort: URLs that raise ``requests.RequestException``
    or answer with a non-200 status are skipped silently. Links are not
    restricted to the start URL's domain.
    """
    print("\nCrawling...")
    visited = set()
    contents = {}
    # Every URL ever queued, so a page linked many times (or one that
    # failed) is requested at most once.
    seen = {url}
    to_visit = deque([(url, 0)])

    while to_visit:
        current_url, depth = to_visit.popleft()
        if depth > max_depth:
            continue

        try:
            response = requests.get(current_url, timeout=timeout)
        except requests.RequestException:
            # Unreachable hosts, timeouts, unsupported schemes (mailto:,
            # javascript:) and similar are skipped on purpose.
            continue
        if response.status_code != 200:
            continue

        visited.add(current_url)
        contents[current_url] = extract_main_content(response.text)

        # Links found here would sit at depth + 1; skip parsing entirely
        # when they could never be fetched.
        if depth >= max_depth:
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all("a", href=True):
            try:
                joined = urllib.parse.urljoin(current_url, link["href"])
                # Fragments are never sent to the server, so "page#a" and
                # "page#b" are the same document.
                absolute_link = urllib.parse.urldefrag(joined).url
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): ignore this
                # link instead of aborting the whole crawl.
                continue
            if absolute_link not in seen:
                seen.add(absolute_link)
                to_visit.append((absolute_link, depth + 1))

    return visited, contents
def crawl_to_json(start_url, *, crawler=None, genai_factory=None):
    """Crawl ``start_url`` and summarize each page into a JSON document.

    Parameters
    ----------
    start_url : str
        URL to crawl. ``https://`` is prepended when it does not already
        start with ``http://`` or ``https://`` (case-insensitive).
    crawler : callable, keyword-only
        Called as ``crawler(start_url)`` and expected to return
        ``(urls, contents)`` where ``contents[url]`` is the page text.
        Defaults to :func:`crawl_url`.
    genai_factory : callable, keyword-only
        Called as ``genai_factory("gpt-4o-mini")`` once per page; the
        result is called with the prompt string. Defaults to
        :func:`_get_genai`.

    Both callables are injectable so callers can supply a real local
    crawler / a deterministic summarizer without monkey-patching.

    Returns
    -------
    str
        JSON text (``indent=2``) of the form
        ``{"start_url": ..., "crawled_pages": [{"url": ..., "content": ...}]}``.
        Pages are listed in completion order, not crawl order.
    """
    if crawler is None:
        crawler = crawl_url
    if genai_factory is None:
        genai_factory = _get_genai

    # Match the full scheme prefix: a bare "http" test mistakes scheme-less
    # hosts such as "httpbin.org" for URLs that already carry a scheme.
    if not start_url.lower().startswith(("http://", "https://")):
        start_url = "https://" + start_url

    crawled_urls, contents = crawler(start_url)

    print("\nSummalizing as json...")

    def process_url(url):
        # One client per task: nothing here shows whether a client may be
        # shared across threads.
        llm = genai_factory("gpt-4o-mini")
        return {
            "url": url,
            "content": llm(f"Summarize this page in 1 line:\n\n{contents[url]}"),
        }

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_url, url) for url in crawled_urls]
        # ``future.result()`` re-raises any exception from a worker.
        crawled_pages = [
            future.result()
            for future in tqdm(
                as_completed(futures),
                total=len(futures),
                desc="Processing URLs",
            )
        ]

    result = {"start_url": start_url, "crawled_pages": crawled_pages}
    return json.dumps(result, indent=2)
def _get_genai(model: str):
    """Build a ``GenAI`` client for ``model``, importing ``scitex_ai`` lazily.

    The import happens on each call rather than at module load, so this
    module can be imported without ``scitex_ai`` installed.

    Raises
    ------
    ImportError
        When ``scitex_ai`` cannot be imported; the message carries an
        install hint and the original error is chained as the cause.
    """
    try:
        from scitex_ai import GenAI  # type: ignore
    except ImportError as import_error:
        install_hint = (
            "summarize_url / summarize_all require the 'scitex' umbrella package "
            "for scitex.ai.GenAI. Install with: pip install scitex"
        )
        raise ImportError(install_hint) from import_error
    else:
        return GenAI(model)


def summarize_all(json_contents, *, genai_factory=None):
    """Ask an LLM for a five-bullet summary of a crawled-JSON document.

    Parameters
    ----------
    json_contents : str
        The JSON text to summarize; it is interpolated into the prompt.
    genai_factory : callable, keyword-only
        Called as ``genai_factory("gpt-4o-mini")``; the result is called
        with the prompt. Defaults to :func:`_get_genai`. Injectable so
        callers can pass a deterministic summarizer.

    Returns
    -------
    Whatever the LLM callable returns for the prompt.
    """
    make_llm = _get_genai if genai_factory is None else genai_factory
    llm = make_llm("gpt-4o-mini")
    return llm(f"Summarize this json file with 5 bullet points:\n\n{json_contents}")
def summarize_url(start_url, *, crawl_fn=None, summarize_fn=None):
    """Crawl ``start_url``, summarize the crawl, print and return the result.

    Parameters
    ----------
    start_url : str
        URL handed to the crawl step.
    crawl_fn : callable, keyword-only
        Called as ``crawl_fn(start_url)``. Defaults to
        :func:`crawl_to_json`.
    summarize_fn : callable, keyword-only
        Called with the crawl step's result. Defaults to
        :func:`summarize_all`.

    Both steps are injectable so the composition can be exercised with
    deterministic stand-ins.

    Returns
    -------
    tuple
        ``(summary, crawl_result)``. The summary is also pretty-printed to
        stdout.
    """
    crawl_step = crawl_to_json if crawl_fn is None else crawl_fn
    summarize_step = summarize_all if summarize_fn is None else summarize_fn

    crawl_output = crawl_step(start_url)
    summary = summarize_step(crawl_output)

    pprint(summary)
    return summary, crawl_output
# Public alias so the module exposes a conventional ``main`` entry point.
main = summarize_url

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="")
    # ``--url`` is required: without it ``args.url`` is ``None`` and the
    # default crawl step fails with an AttributeError on
    # ``start_url.startswith`` instead of a usage message.
    parser.add_argument(
        "--url",
        "-u",
        type=str,
        required=True,
        help="start URL to crawl and summarize",
    )
    args = parser.parse_args()
    print(f"Args: {args}")
    main(args.url)