# Source code for wikitransp.scraper.scrape

from __future__ import annotations

from signal import SIGINT
from sys import stderr

from .check_png import filter_tsv_rows
from .decompress_utils import decompress_gz_files
from .download_utils import download_data_url, download_dataset

# Public API of this module: only the scraping entry point is exported.
__all__ = ["scrape_images"]


def scrape_images(
    sample: bool = False,
    resume_at: str | None = None,
    resume_after: str | None = None,
    decompress_tsv: bool = False,
    fetch_async: bool = True,
) -> None:
    """
    Build a local dataset by scanning the WIT dataset (or a small sample of it)
    for suitable PNGs.

    Note: only pass one of ``resume_at`` or ``resume_after``.

    Args:
      sample         : Whether to only scrape the 1% sample dataset.
      resume_at      : The image URL to resume at (if scraping was interrupted).
      resume_after   : The image URL to resume after (if scraping was
                       interrupted). This function does not currently forward
                       it to ``filter_tsv_rows``; a warning is written to
                       stderr when it is supplied.
      decompress_tsv : Whether to decompress gzipped TSVs before filtering (not
                       necessary, and will increase dataset file size on disk).
      fetch_async    : Passed through unchanged to ``filter_tsv_rows``.

    Raises:
      ValueError : If both ``resume_at`` and ``resume_after`` are given.
      SystemExit : With the ``SIGINT`` signal number as the exit code, when the
                   run is interrupted by a ``KeyboardInterrupt``.
      Exception  : Any other error raised while downloading, decompressing or
                   filtering is re-raised (with ``SystemExit(10)`` attached as
                   its ``__cause__``).
    """
    # Enforce the documented contract up front, before any download starts and
    # outside the ``try`` so the error is not routed through the handlers below.
    if resume_at is not None and resume_after is not None:
        raise ValueError("Only pass one of `resume_at` or `resume_after`, not both")
    if resume_after is not None:
        # NOTE(review): `resume_after` is accepted but never used by this
        # function. Whether `filter_tsv_rows` can take it is not visible from
        # here, so make the no-op loud instead of silently restarting.
        print(
            "Warning: `resume_after` is not forwarded by scrape_images and will "
            "be ignored; scraping starts from the beginning unless `resume_at` "
            "is given.",
            file=stderr,
        )
    try:
        dataset_files = download_dataset(sample=sample)
        # Filter either the decompressed copies or the gzipped originals.
        tsv_files = (
            decompress_gz_files(paths=dataset_files)
            if decompress_tsv
            else dataset_files
        )
        filter_tsv_rows(
            input_tsv_files=tsv_files,
            resume_at=resume_at,
            fetch_async=fetch_async,
        )
    except KeyboardInterrupt:
        raise SystemExit(SIGINT.value)  # exit code 2
    except Exception as exc:
        # NOTE(review): this re-raises the original exception and only attaches
        # SystemExit(10) as its cause; it does not exit with status 10. If an
        # exit code of 10 was intended, this should probably read
        # `raise SystemExit(10) from exc` - confirm before changing, since that
        # would alter the exception type callers see.
        raise exc from SystemExit(10)