Skip to content

Commit

Permalink
Add web archive crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
Jourdelune committed Sep 21, 2024
1 parent 3b92cd1 commit 314b622
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 7 deletions.
1 change: 1 addition & 0 deletions multi_crawler/crawlers/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .youtube_crawls import YoutubeCrawler
from .web_archive import ArchiveCrawler
25 changes: 25 additions & 0 deletions multi_crawler/crawlers/crawlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Base class for crawlers
"""

from abc import ABC, abstractmethod
from typing import Callable


class BaseCrawler(ABC):
    """Abstract base class defining the interface every crawler implements."""

    @abstractmethod
    def __init__(self, callback: Callable, *args, **kwargs):
        """Initialize the crawler.

        Args:
            callback (Callable): function invoked with the URLs of the audio files
        """
        super().__init__()

    @abstractmethod
    def crawl(self) -> None:
        """Run the crawler."""
70 changes: 70 additions & 0 deletions multi_crawler/crawlers/web_archive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
Archive net downloader
"""

from typing import Callable

import internetarchive

from .crawlers import BaseCrawler


class ArchiveCrawler(BaseCrawler):
    """Class to find and return URLs of audio files from the Archive.org website."""

    BASE_URL = "https://archive.org/download/"

    # Item metadata fields copied verbatim into the callback metadata.
    _METADATA_FIELDS = ("title", "album", "genre")

    def __init__(self, collection: str, callback: Callable):
        """Initialize the ArchiveCrawler object.

        Args:
            collection (str): the collection to search for mp3 files
            callback (Callable): the function to call with the URLs of the audio files
        """
        self._callback = callback
        self._collection = collection

    def _find_url(self, item_id: str) -> None:
        """Invoke the callback with the URL of every mp3 file of an item.

        Args:
            item_id (str): the Archive.org item identifier
        """
        item = internetarchive.get_item(item_id)

        for file in item.files:
            # Some file entries have no "format" key; skip them instead of
            # raising a KeyError.
            if "mp3" not in file.get("format", "").lower():
                continue

            url = f"{self.BASE_URL}{item.identifier}/{file['name']}"

            # Copy the simple string fields that are present on this item.
            metadata = {
                field: item.metadata[field]
                for field in self._METADATA_FIELDS
                if field in item.metadata
            }

            # "subject" may be a single string or a list of strings.
            subject = item.metadata.get("subject", [])
            if isinstance(subject, str):
                subject = [subject]
            if subject:
                metadata["keywords"] = ", ".join(subject)

            self._callback(url, **metadata)

    def crawl(self) -> None:
        """Search the collection and extract audio URLs from every item.

        If the search returns no result, the collection name is assumed to be
        a single item identifier and is crawled directly.
        """
        search = internetarchive.search_items(f"collection:{self._collection}")

        if len(search) == 0:
            # Not a collection: treat the name as an item id itself.
            self._find_url(self._collection)
        else:
            for result in search:
                self._find_url(result["identifier"])
3 changes: 2 additions & 1 deletion multi_crawler/crawlers/youtube_crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
from typing import Any, Callable, Dict, List, Sequence

from ..session import Session
from .crawlers import BaseCrawler


class YoutubeCrawler:
class YoutubeCrawler(BaseCrawler):
"""
Find and return URLs of Youtube videos based on search terms.
"""
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
requests[socks]
python-dotenv
stem
internetarchive
10 changes: 5 additions & 5 deletions test.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
"""Manual smoke-test script: crawl an Archive.org collection and print URLs."""

from multi_crawler import ArchiveCrawler, CSVExporter, Session, YoutubeCrawler

# Running counter of how many URLs the callback has received.
i = 0


def print_url(url: str, **kwargs):
    """Print each crawled URL with its index and title metadata (if any).

    Args:
        url (str): the URL of an audio file
        **kwargs: optional metadata forwarded by the crawler (title, album, ...)
    """
    global i
    i += 1
    print(url, i, kwargs.get("title"))


exporter = CSVExporter("results.csv", "URL", overwrite=True)
crawlers = ArchiveCrawler("ultra-japanese-sound-collection", callback=print_url)
# crawlers = YoutubeCrawler("phonk", callback=exporter, session=Session)

crawlers.crawl()

0 comments on commit 314b622

Please sign in to comment.