Skip to content

Commit

Permalink
Add web archive crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
Jourdelune committed Sep 21, 2024
1 parent 3b92cd1 commit 314b622
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 7 deletions.
1 change: 1 addition & 0 deletions multi_crawler/crawlers/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .youtube_crawls import YoutubeCrawler
from .web_archive import ArchiveCrawler
25 changes: 25 additions & 0 deletions multi_crawler/crawlers/crawlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Base class for crawlers
"""

from abc import ABC, abstractmethod
from typing import Callable


class BaseCrawler(ABC):
    """Abstract base class defining the interface every crawler implements."""

    @abstractmethod
    def __init__(self, callback: Callable, *args, **kwargs):
        """Initialize the crawler.

        Args:
            callback (Callable): function invoked with the URLs of the audio files
        """
        super().__init__()

    @abstractmethod
    def crawl(self) -> None:
        """Run the crawler."""
70 changes: 70 additions & 0 deletions multi_crawler/crawlers/web_archive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
Archive net downloader
"""

from typing import Callable

import internetarchive

from .crawlers import BaseCrawler


class ArchiveCrawler(BaseCrawler):
    """Class to find and return URLs of audio files from the Archive.org website."""

    BASE_URL = "https://archive.org/download/"

    # Item metadata fields copied verbatim into the callback metadata.
    _METADATA_FIELDS = ("title", "album", "genre")

    def __init__(self, collection: str, callback: Callable):
        """Initialize the ArchiveCrawler object.

        Args:
            collection (str): the collection to search for mp3 files
            callback (Callable): the function to call with the URLs of the audio files
        """
        self._callback = callback
        self._collection = collection

    def _find_url(self, item_id: str) -> None:
        """Invoke the callback with the URL of every mp3 file of an item.

        Args:
            item_id (str): the Archive.org item identifier
        """
        item = internetarchive.get_item(item_id)

        for file in item.files:
            # Some file entries have no "format" key; skip them instead of
            # raising a KeyError.
            if "mp3" not in file.get("format", "").lower():
                continue

            url = f"{self.BASE_URL}{item.identifier}/{file['name']}"

            # Copy the simple string fields that are present on this item.
            metadata = {
                field: item.metadata[field]
                for field in self._METADATA_FIELDS
                if field in item.metadata
            }

            # "subject" may be a single string or a list of strings.
            subject = item.metadata.get("subject", [])
            if isinstance(subject, str):
                subject = [subject]
            if subject:
                metadata["keywords"] = ", ".join(subject)

            self._callback(url, **metadata)

    def crawl(self) -> None:
        """Search the collection and extract audio URLs from every item.

        If the search returns no result, the collection name is assumed to be
        a single item identifier and is crawled directly.
        """
        search = internetarchive.search_items(f"collection:{self._collection}")

        if len(search) == 0:
            # Not a collection: treat the name as an item id itself.
            self._find_url(self._collection)
        else:
            for result in search:
                self._find_url(result["identifier"])
3 changes: 2 additions & 1 deletion multi_crawler/crawlers/youtube_crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
from typing import Any, Callable, Dict, List, Sequence

from ..session import Session
from .crawlers import BaseCrawler


class YoutubeCrawler:
class YoutubeCrawler(BaseCrawler):
"""
Find and return URLs of Youtube videos based on search terms.
"""
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
requests[socks]
python-dotenv
stem
internetarchive
10 changes: 5 additions & 5 deletions test.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
"""Manual smoke-test script: crawl an Archive.org collection and print URLs."""

from multi_crawler import ArchiveCrawler, CSVExporter, Session, YoutubeCrawler

# Running counter of how many URLs the callback has received.
i = 0


def print_url(url: str, **kwargs):
    """Print each crawled URL with its index and title metadata (if any).

    Args:
        url (str): the URL of an audio file
        **kwargs: optional metadata forwarded by the crawler (title, album, ...)
    """
    global i
    i += 1
    print(url, i, kwargs.get("title"))


exporter = CSVExporter("results.csv", "URL", overwrite=True)
crawlers = ArchiveCrawler("ultra-japanese-sound-collection", callback=print_url)
# crawlers = YoutubeCrawler("phonk", callback=exporter, session=Session)

crawlers.crawl()

0 comments on commit 314b622

Please sign in to comment.