From 58053fd6a2a625d85c1e5bfebf135f4b56473a30 Mon Sep 17 00:00:00 2001
From: Jourdelune
Date: Sat, 21 Sep 2024 22:21:01 +0200
Subject: [PATCH] add data collection from txt file

---
 .gitignore                               |  3 +-
 README.md                                |  2 +-
 main.py                                  | 66 ++++++++++++++++++++++++
 multi_crawler/crawlers/web_archive.py    |  2 +-
 multi_crawler/crawlers/youtube_crawls.py |  2 +-
 multi_crawler/exports/csv_exporter.py    | 24 ++++++---
 test.py                                  | 17 ------
 test2.py                                 | 23 ---------
 8 files changed, 87 insertions(+), 52 deletions(-)
 create mode 100644 main.py
 delete mode 100644 test.py
 delete mode 100644 test2.py

diff --git a/.gitignore b/.gitignore
index b5e914e..07f6743 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,4 +159,5 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-*.csv
\ No newline at end of file
+*.csv
+src_*.txt
diff --git a/README.md b/README.md
index 8b34368..262bec1 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ pip install -r requirements.txt
 Run the crawler
 
 ```bash
-python src/main.py
+python main.py --csv --input FILE.txt --overwrite --file_name FILE.csv
 ```
 
 ## License
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..67be3ac
--- /dev/null
+++ b/main.py
@@ -0,0 +1,66 @@
+import argparse
+
+from multi_crawler import (
+    ArchiveCrawler,
+    CSVExporter,
+    Session,
+    TorSession,
+    YoutubeCrawler,
+)
+from multi_crawler.models import Audio
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser(
+        prog="multi_crawler",
+        description="Utility to crawl audio files from the internet using web.archive.org and youtube.com",
+    )
+    argparser.add_argument(
+        "--input",
+        type=str,
+        required=True,
+        help="Input file with search terms from youtube or collection name from archive.org",
+    )
+    argparser.add_argument(
+        "--csv",
+        required=True,
+        action="store_true",
+        help="Output file in CSV format",
+    )
+    argparser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Overwrite the csv file if it exists",
+    )
+    argparser.add_argument(
+        "--file_name",
+        type=str,
+        help="Name of the output file",
+        required=False,
+    )
+    argparser.add_argument(
+        "--tor_proxy",
+        action="store_true",
+        help="Use Tor proxy to make requests on youtube",
+        default=False,
+    )
+
+    args = argparser.parse_args()
+
+    if args.csv and args.file_name is None:
+        raise ValueError("Please provide the name of the output file")
+
+    exporter = CSVExporter(args.file_name, overwrite=args.overwrite)
+
+    with open(args.input, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+
+            if line.startswith("youtube:"):
+                crawlers = YoutubeCrawler(
+                    line.split(" ", 1)[1],
+                    callback=exporter,
+                    session=TorSession if args.tor_proxy else Session,
+                )
+            else:
+                crawlers = ArchiveCrawler(line.split(" ", 1)[1], callback=exporter)
+            crawlers.crawl()
diff --git a/multi_crawler/crawlers/web_archive.py b/multi_crawler/crawlers/web_archive.py
index 8867cbb..1cd1abe 100644
--- a/multi_crawler/crawlers/web_archive.py
+++ b/multi_crawler/crawlers/web_archive.py
@@ -60,7 +60,7 @@ def _find_url(self, item_id: str) -> None:
         metadata["url"] = url
 
         audio = Audio(**metadata)
-        self._callback(url, audio)
+        self._callback(audio)
 
     def crawl(self) -> None:
         """Search and extract ids"""
diff --git a/multi_crawler/crawlers/youtube_crawls.py b/multi_crawler/crawlers/youtube_crawls.py
index 400d40f..09d83d1 100644
--- a/multi_crawler/crawlers/youtube_crawls.py
+++ b/multi_crawler/crawlers/youtube_crawls.py
@@ -174,7 +174,7 @@ def crawl(self, nb_results: int = float("inf")) -> None:
                 )
 
                 # Call the callback function
-                self._callback(video_url, audio)
+                self._callback(audio)
                 results_found += 1
             elif "continuationItemRenderer" in content:
                 continuation_token = content["continuationItemRenderer"][
diff --git a/multi_crawler/exports/csv_exporter.py b/multi_crawler/exports/csv_exporter.py
index 0efc697..63b47ee 100644
--- a/multi_crawler/exports/csv_exporter.py
+++ b/multi_crawler/exports/csv_exporter.py
@@ -7,27 +7,35 @@
 import os
 from typing import List
 
+from ..models import Audio
+
 
 class CSVExporter:
     """Class to export the results of the crawler to a CSV file."""
 
-    def __init__(self, filename: str, *columns: List[str], overwrite: bool = False):
+    def __init__(self, filename: str, overwrite: bool = False):
         self._filename = filename
-        self._columns = columns
+        self._columns = list(Audio.model_fields.keys())
 
         # Write the columns to the CSV file
         if overwrite or not os.path.exists(self._filename):
             with open(self._filename, "w", newline="", encoding="utf-8") as f:
                 writer = csv.writer(f)
-                writer.writerow(columns)
+                writer.writerow(self._columns)
 
-    def __call__(self, *items: List[str]):
-        """Add a URL to the CSV file.
+    def __call__(self, audio: Audio):
+        """Write the information of the audio to the CSV file.
 
         Args:
-            items (List[str]): the items to add to the CSV file
+            audio (Audio): the audio object to write to the CSV file
         """
-
         with open(self._filename, "a", newline="", encoding="utf-8") as f:
             writer = csv.writer(f)
-            writer.writerow(items)
+
+            # Write the values of the audio object to the CSV file
+            writer.writerow(
+                [
+                    "" if getattr(audio, field) is None else getattr(audio, field)
+                    for field in self._columns
+                ]
+            )
diff --git a/test.py b/test.py
deleted file mode 100644
index aaf542d..0000000
--- a/test.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from multi_crawler import ArchiveCrawler, CSVExporter, Session, YoutubeCrawler
-from multi_crawler.models import Audio
-
-i = 0
-
-
-def print_url(url: str, audio):
-    global i
-    i += 1
-    print(url, i)
-
-
-exporter = CSVExporter("results.csv", overwrite=True, *list(Audio.model_fields.keys()))
-# crawlers = ArchiveCrawler("ultra-japanese-sound-collection", callback=print_url)
-crawlers = YoutubeCrawler("phonk", callback=print_url, session=Session, process=False)
-
-crawlers.crawl()
diff --git a/test2.py b/test2.py
deleted file mode 100644
index 9d80d40..0000000
--- a/test2.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import re
-
-import requests
-
-s = requests.Session()
-r = s.get("https://www.youtube.com/watch?v=o1A5hQZyuC4")
-
-
-def _get_description(content):
-    description_match = re.search(
-        r'attributedDescription":\{"content":"((?:[^"\\]|\\.)*?)"',
-        content,
-        re.DOTALL,
-    )
-
-    descr = ""
-    if description_match:
-        descr = description_match.group(1)
-
-    return descr
-
-
-print(_get_description(r.text))