add data collection from txt file

WaveGenAI · Sep 21, 2024 · 58053fd · 58053fd
1 parent 6fbe8eb
commit 58053fd
Show file tree

Hide file tree

Showing 8 changed files with 87 additions and 52 deletions.
diff --git a/.gitignore b/.gitignore
@@ -159,4 +159,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-*.csv
+*.csv
+src_*.txt
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ pip install -r requirements.txt
 
 Run the crawler
 ```bash
-python src/main.py
+python main.py --csv --input FILE.txt --overwrite --file_name FILE.csv
 ```
 
 ## License

diff --git a/main.py b/main.py
@@ -0,0 +1,66 @@
+import argparse
+
+from multi_crawler import (
+    ArchiveCrawler,
+    CSVExporter,
+    Session,
+    TorSession,
+    YoutubeCrawler,
+)
+from multi_crawler.models import Audio
+
+if __name__ == "__main__":
+    # CLI entry point: crawl every source listed in --input and export the results to one CSV file.
+    argparser = argparse.ArgumentParser(
+        prog="multi_crawler",
+        description="Utility to crawl audio files from the internet using archive.org and youtube.com",
+    )
+    argparser.add_argument(
+        "--input",
+        type=str,
+        required=True,
+        help="Input file with search terms from youtube or collection name from archive.org",
+    )
+    argparser.add_argument(
+        "--csv",
+        required=True,
+        action="store_true",
+        help="Output file in CSV format",
+    )
+    argparser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Overwrite the csv file if it exists",
+    )
+    argparser.add_argument(
+        "--file_name",
+        type=str,
+        help="Name of the output file",
+    )
+    argparser.add_argument(
+        "--tor_proxy",
+        action="store_true",
+        help="Use Tor proxy to make requests on youtube",
+    )
+
+    args = argparser.parse_args()
+
+    if args.csv and args.file_name is None:
+        raise ValueError("Please provide the name of the output file")
+
+    exporter = CSVExporter(args.file_name, overwrite=args.overwrite)
+    session = TorSession if args.tor_proxy else Session
+
+    with open(args.input, "r", encoding="utf-8") as f:
+        for line in f:
+            # The token before the first space picks the crawler; the rest is its query.
+            source, _, term = line.strip().partition(" ")
+            if not term:  # blank or query-less line (used to raise IndexError)
+                if source:
+                    print(f"Skipping line without a search term: {source!r}")
+                continue
+            if source.startswith("youtube:"):
+                crawlers = YoutubeCrawler(term, callback=exporter, session=session)
+            else:
+                crawlers = ArchiveCrawler(term, callback=exporter)
+            crawlers.crawl()
diff --git a/multi_crawler/crawlers/web_archive.py b/multi_crawler/crawlers/web_archive.py
@@ -60,7 +60,7 @@ def _find_url(self, item_id: str) -> None:
                 metadata["url"] = url
 
                 audio = Audio(**metadata)
-                self._callback(url, audio)
+                self._callback(audio)
 
     def crawl(self) -> None:
         """Search and extract ids"""

diff --git a/multi_crawler/crawlers/youtube_crawls.py b/multi_crawler/crawlers/youtube_crawls.py
@@ -174,7 +174,7 @@ def crawl(self, nb_results: int = float("inf")) -> None:
                                     )
 
                                 # Call the callback function
-                                self._callback(video_url, audio)
+                                self._callback(audio)
                                 results_found += 1
                 elif "continuationItemRenderer" in content:
                     continuation_token = content["continuationItemRenderer"][

diff --git a/multi_crawler/exports/csv_exporter.py b/multi_crawler/exports/csv_exporter.py
@@ -7,27 +7,35 @@
 import os
 from typing import List
 
+from ..models import Audio
+
 
 class CSVExporter:
     """Class to export the results of the crawler to a CSV file."""
 
-    def __init__(self, filename: str, *columns: List[str], overwrite: bool = False):
+    def __init__(self, filename: str, overwrite: bool = False):
         self._filename = filename
-        self._columns = columns
+        self._columns = list(Audio.model_fields.keys())
 
         # Write the columns to the CSV file
         if overwrite or not os.path.exists(self._filename):
             with open(self._filename, "w", newline="", encoding="utf-8") as f:
                 writer = csv.writer(f)
-                writer.writerow(columns)
+                writer.writerow(self._columns)
 
-    def __call__(self, *items: List[str]):
-        """Add a URL to the CSV file.
+    def __call__(self, audio: Audio):
+        """Append one row holding the audio's field values to the CSV file.
 
         Args:
-            items (List[str]): the items to add to the CSV file
+            audio (Audio): the audio object to write; attributes that are None become empty cells
         """
-
         with open(self._filename, "a", newline="", encoding="utf-8") as f:
             writer = csv.writer(f)
-            writer.writerow(items)
+
+            # One cell per entry of self._columns, in that order; None is written as ""
+            writer.writerow(
+                [
+                    "" if getattr(audio, field) is None else getattr(audio, field)
+                    for field in self._columns
+                ]
+            )
diff --git a/test.py b/test.py
diff --git a/test2.py b/test2.py
-Original file line number
+Diff line change
@@ Expand Up / @@ -159,4 +159,5 @@ cython_debug/ @@
     #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
     #.idea/
-    *.csv
+    *.csv
+    src_*.txt