From 58053fd6a2a625d85c1e5bfebf135f4b56473a30 Mon Sep 17 00:00:00 2001
From: Jourdelune
Date: Sat, 21 Sep 2024 22:21:01 +0200
Subject: [PATCH] add data collection from txt file

---
 .gitignore                               |  3 +-
 README.md                                |  2 +-
 main.py                                  | 66 ++++++++++++++++++++++++
 multi_crawler/crawlers/web_archive.py    |  2 +-
 multi_crawler/crawlers/youtube_crawls.py |  2 +-
 multi_crawler/exports/csv_exporter.py    | 24 ++++++---
 test.py                                  | 17 ------
 test2.py                                 | 23 ---------
 8 files changed, 87 insertions(+), 52 deletions(-)
 create mode 100644 main.py
 delete mode 100644 test.py
 delete mode 100644 test2.py

diff --git a/.gitignore b/.gitignore
index b5e914e..07f6743 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,4 +159,5 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-*.csv
\ No newline at end of file
+*.csv
+src_*.txt
diff --git a/README.md b/README.md
index 8b34368..262bec1 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ pip install -r requirements.txt
 Run the crawler
 
 ```bash
-python src/main.py
+python main.py --csv --input FILE.txt --overwrite --file_name FILE.csv
 ```
 
 ## License
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..67be3ac
--- /dev/null
+++ b/main.py
@@ -0,0 +1,66 @@
+import argparse
+
+from multi_crawler import (
+    ArchiveCrawler,
+    CSVExporter,
+    Session,
+    TorSession,
+    YoutubeCrawler,
+)
+from multi_crawler.models import Audio
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser(
+        prog="multi_crawler",
+        description="Utility to crawl audio files from the internet using web.archive.org and youtube.com",
+    )
+    argparser.add_argument(
+        "--input",
+        type=str,
+        required=True,
+        help="Input file with search terms from youtube or collection name from archive.org",
+    )
+    argparser.add_argument(
+        "--csv",
+        required=True,
+        action="store_true",
+        help="Output file in CSV format",
+    )
+    argparser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Overwrite the csv file if it exists",
+    )
+    argparser.add_argument(
+        "--file_name",
+        type=str,
+        help="Name of the output file",
+        required=False,
+    )
+    argparser.add_argument(
+        "--tor_proxy",
+        action="store_true",
+        help="Use Tor proxy to make requests on youtube",
+        default=False,
+    )
+
+    args = argparser.parse_args()
+
+    if args.csv and args.file_name is None:
+        raise ValueError("Please provide the name of the output file")
+
+    exporter = CSVExporter(args.file_name, overwrite=args.overwrite)
+
+    with open(args.input, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+
+            if line.startswith("youtube:"):
+                crawlers = YoutubeCrawler(
+                    line.split(" ", 1)[1],
+                    callback=exporter,
+                    session=TorSession if args.tor_proxy else Session,
+                )
+            else:
+                crawlers = ArchiveCrawler(line.split(" ", 1)[1], callback=exporter)
+            crawlers.crawl()
diff --git a/multi_crawler/crawlers/web_archive.py b/multi_crawler/crawlers/web_archive.py
index 8867cbb..1cd1abe 100644
--- a/multi_crawler/crawlers/web_archive.py
+++ b/multi_crawler/crawlers/web_archive.py
@@ -60,7 +60,7 @@ def _find_url(self, item_id: str) -> None:
         metadata["url"] = url
 
         audio = Audio(**metadata)
-        self._callback(url, audio)
+        self._callback(audio)
 
     def crawl(self) -> None:
         """Search and extract ids"""
diff --git a/multi_crawler/crawlers/youtube_crawls.py b/multi_crawler/crawlers/youtube_crawls.py
index 400d40f..09d83d1 100644
--- a/multi_crawler/crawlers/youtube_crawls.py
+++ b/multi_crawler/crawlers/youtube_crawls.py
@@ -174,7 +174,7 @@ def crawl(self, nb_results: int = float("inf")) -> None:
                 )
 
                 # Call the callback function
-                self._callback(video_url, audio)
+                self._callback(audio)
                 results_found += 1
             elif "continuationItemRenderer" in content:
                 continuation_token = content["continuationItemRenderer"][
diff --git a/multi_crawler/exports/csv_exporter.py b/multi_crawler/exports/csv_exporter.py
index 0efc697..63b47ee 100644
--- a/multi_crawler/exports/csv_exporter.py
+++ b/multi_crawler/exports/csv_exporter.py
@@ -7,27 +7,35 @@
 import os
 from typing import List
 
+from ..models import Audio
+
 
 class CSVExporter:
     """Class to export the results of the crawler to a CSV file."""
 
-    def __init__(self, filename: str, *columns: List[str], overwrite: bool = False):
+    def __init__(self, filename: str, overwrite: bool = False):
         self._filename = filename
-        self._columns = columns
+        self._columns = list(Audio.model_fields.keys())
 
         # Write the columns to the CSV file
         if overwrite or not os.path.exists(self._filename):
             with open(self._filename, "w", newline="", encoding="utf-8") as f:
                 writer = csv.writer(f)
-                writer.writerow(columns)
+                writer.writerow(self._columns)
 
-    def __call__(self, *items: List[str]):
-        """Add a URL to the CSV file.
+    def __call__(self, audio: Audio):
+        """Write the information of the audio to the CSV file.
 
         Args:
-            items (List[str]): the items to add to the CSV file
+            audio (Audio): the audio object to write to the CSV file
         """
-
         with open(self._filename, "a", newline="", encoding="utf-8") as f:
             writer = csv.writer(f)
-            writer.writerow(items)
+
+            # Write the values of the audio object to the CSV file
+            writer.writerow(
+                [
+                    "" if getattr(audio, field) is None else getattr(audio, field)
+                    for field in self._columns
+                ]
+            )
diff --git a/test.py b/test.py
deleted file mode 100644
index aaf542d..0000000
--- a/test.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from multi_crawler import ArchiveCrawler, CSVExporter, Session, YoutubeCrawler
-from multi_crawler.models import Audio
-
-i = 0
-
-
-def print_url(url: str, audio):
-    global i
-    i += 1
-    print(url, i)
-
-
-exporter = CSVExporter("results.csv", overwrite=True, *list(Audio.model_fields.keys()))
-# crawlers = ArchiveCrawler("ultra-japanese-sound-collection", callback=print_url)
-crawlers = YoutubeCrawler("phonk", callback=print_url, session=Session, process=False)
-
-crawlers.crawl()
diff --git a/test2.py b/test2.py
deleted file mode 100644
index 9d80d40..0000000
--- a/test2.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import re
-
-import requests
-
-s = requests.Session()
-r = s.get("https://www.youtube.com/watch?v=o1A5hQZyuC4")
-
-
-def _get_description(content):
-    description_match = re.search(
-        r'attributedDescription":\{"content":"((?:[^"\\]|\\.)*?)"',
-        content,
-        re.DOTALL,
-    )
-
-    descr = ""
-    if description_match:
-        descr = description_match.group(1)
-
-    return descr
-
-
-print(_get_description(r.text))