-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3b92cd1
commit 314b622
Showing
6 changed files
with
105 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
from .youtube_crawls import YoutubeCrawler | ||
from .web_archive import ArchiveCrawler |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
""" | ||
Base class for crawlers | ||
""" | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import Callable | ||
|
||
|
||
class BaseCrawler(ABC):
    """Abstract interface that every crawler implements."""

    @abstractmethod
    def __init__(self, callback: Callable, *args, **kwargs):
        """Set up the crawler.

        Declared abstract, so each concrete crawler has to provide its own
        constructor; extra positional and keyword arguments are accepted
        here and ignored.

        Args:
            callback (Callable): the function to call with the URLs of the audio files
        """
        super().__init__()

    @abstractmethod
    def crawl(self) -> None:
        """Run the crawler."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
""" | ||
Archive net downloader | ||
""" | ||
|
||
from typing import Callable | ||
|
||
import internetarchive | ||
|
||
from .crawlers import BaseCrawler | ||
|
||
|
||
class ArchiveCrawler(BaseCrawler):
    """Class to find and return URLs of audio files from the Archive.org website.

    The identifier given to the constructor is first searched as a
    collection and every item returned by that search is scanned. When the
    search returns nothing, the identifier is scanned as a single item.
    """

    BASE_URL = "https://archive.org/download/"

    # Item metadata fields that are forwarded to the callback unchanged.
    _COPIED_FIELDS = ("title", "album", "genre")

    def __init__(self, collection: str, callback: Callable):
        """Initialize the ArchiveCrawler object.

        Args:
            collection (str): the collection (or single item) identifier to
                search for mp3 files
            callback (Callable): called as ``callback(url, **metadata)`` for
                every mp3 file found; ``metadata`` may contain the keys
                ``title``, ``album``, ``genre`` and ``keywords``
        """
        super().__init__(callback)

        self._callback = callback
        self._collection = collection

    @classmethod
    def _item_metadata(cls, item) -> dict:
        """Build the keyword arguments passed to the callback for an item.

        Args:
            item: the object returned by ``internetarchive.get_item``

        Returns:
            dict: the ``title``/``album``/``genre`` entries that exist in the
            item metadata, plus ``keywords`` (the subjects joined with
            ``", "``) when the item has at least one subject
        """
        fields = item.metadata
        metadata = {key: fields[key] for key in cls._COPIED_FIELDS if key in fields}

        # "subject" is either a single string or a list of strings.
        subject = fields.get("subject", [])
        if isinstance(subject, str):
            subject = [subject]

        if subject:
            metadata["keywords"] = ", ".join(subject)

        return metadata

    def _find_url(self, item_id: str) -> None:
        """Report every mp3 file of an item to the callback.

        Args:
            item_id (str): the item id
        """
        item = internetarchive.get_item(item_id)

        # The metadata describes the item, not the individual file, so it is
        # built once and reused for every file.
        metadata = self._item_metadata(item)

        for file in item.files:
            # Tolerate entries with a missing or empty format instead of
            # aborting the whole crawl with a KeyError.
            file_format = file.get("format") or ""
            if "mp3" not in file_format.lower():
                continue

            url = f"{self.BASE_URL}{item.identifier}/{file['name']}"
            self._callback(url, **metadata)

    def crawl(self) -> None:
        """Search the collection and report the mp3 files of each result."""
        search = internetarchive.search_items(f"collection:{self._collection}")

        if len(search) == 0:
            # Nothing belongs to a collection with this name: treat the
            # identifier as a single item instead.
            self._find_url(self._collection)
            return

        for result in search:
            self._find_url(result["identifier"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
requests[socks] | ||
python-dotenv | ||
stem | ||
stem | ||
internetarchive |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,16 @@ | ||
from multi_crawler import CSVExporter, Session, YoutubeCrawler | ||
|
||
from multi_crawler import ArchiveCrawler, CSVExporter, Session, YoutubeCrawler | ||
|
||
i = 0 | ||
|
||
|
||
def print_url(url: str): | ||
def print_url(url: str, **kwargs): | ||
global i | ||
i += 1 | ||
print(url, i) | ||
print(url, i, kwargs.get("title")) | ||
|
||
|
||
exporter = CSVExporter("results.csv", "URL", overwrite=True) | ||
crawlers = YoutubeCrawler("phonk", callback=exporter, session=Session) | ||
crawlers = ArchiveCrawler("ultra-japanese-sound-collection", callback=print_url) | ||
# crawlers = YoutubeCrawler("phonk", callback=exporter, session=Session) | ||
|
||
crawlers.crawl() |