Skip to content

Commit

Permalink
add data collection from txt file
Browse files Browse the repository at this point in the history
  • Loading branch information
Jourdelune committed Sep 21, 2024
1 parent 6fbe8eb commit 58053fd
Show file tree
Hide file tree
Showing 8 changed files with 87 additions and 52 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

*.csv
*.csv
src_*.txt
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pip install -r requirements.txt

Run the crawler
```bash
python src/main.py
python main.py --csv --input FILE.txt --overwrite --file_name FILE.csv
```

## License
Expand Down
66 changes: 66 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import argparse

from multi_crawler import (
ArchiveCrawler,
CSVExporter,
Session,
TorSession,
YoutubeCrawler,
)
from multi_crawler.models import Audio

if __name__ == "__main__":
    # Command-line entry point: read crawl targets from a text file and send
    # every audio item found by the crawlers to a CSV exporter.
    argparser = argparse.ArgumentParser(
        prog="multi_crawler",
        description="Utility to crawl audio files from the internet using webarhive.org and youtube.com",
    )
    argparser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input file with search terms from youtube or collection name from archive.org",
    )
    argparser.add_argument(
        "--csv",
        required=True,
        action="store_true",
        help="Output file in CSV format",
    )
    argparser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite the csv file if it exists",
    )
    argparser.add_argument(
        "--file_name",
        type=str,
        help="Name of the output file",
        required=False,
    )
    argparser.add_argument(
        "--tor_proxy",
        action="store_true",
        help="Use Tor proxy to make requests on youtube",
        default=False,
    )

    args = argparser.parse_args()

    # The CSV exporter needs a destination file.
    if args.csv and args.file_name is None:
        raise ValueError("Please provide the name of the output file")

    exporter = CSVExporter(args.file_name, overwrite=args.overwrite)

    # Each non-blank input line is "<source> <query>": the text before the
    # first space selects the crawler, the remainder is passed to it.
    with open(args.input, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()

            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no crawl target; skip them instead of failing.
            if not line:
                continue

            _, _, query = line.partition(" ")
            query = query.strip()
            if not query:
                # Report where the malformed entry is rather than surfacing
                # an IndexError from an out-of-range split result.
                raise ValueError(
                    f"{args.input}:{line_number}: expected '<source> <query>', "
                    f"got {line!r}"
                )

            if line.startswith("youtube:"):
                crawler = YoutubeCrawler(
                    query,
                    callback=exporter,
                    session=TorSession if args.tor_proxy else Session,
                )
            else:
                # Any line that is not a "youtube:" entry goes to ArchiveCrawler.
                crawler = ArchiveCrawler(query, callback=exporter)

            crawler.crawl()
2 changes: 1 addition & 1 deletion multi_crawler/crawlers/web_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def _find_url(self, item_id: str) -> None:
metadata["url"] = url

audio = Audio(**metadata)
self._callback(url, audio)
self._callback(audio)

def crawl(self) -> None:
"""Search and extract ids"""
Expand Down
2 changes: 1 addition & 1 deletion multi_crawler/crawlers/youtube_crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def crawl(self, nb_results: int = float("inf")) -> None:
)

# Call the callback function
self._callback(video_url, audio)
self._callback(audio)
results_found += 1
elif "continuationItemRenderer" in content:
continuation_token = content["continuationItemRenderer"][
Expand Down
24 changes: 16 additions & 8 deletions multi_crawler/exports/csv_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,35 @@
import os
from typing import List

from ..models import Audio


class CSVExporter:
"""Class to export the results of the crawler to a CSV file."""

def __init__(self, filename: str, *columns: List[str], overwrite: bool = False):
def __init__(self, filename: str, overwrite: bool = False):
self._filename = filename
self._columns = columns
self._columns = list(Audio.model_fields.keys())

# Write the columns to the CSV file
if overwrite or not os.path.exists(self._filename):
with open(self._filename, "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(columns)
writer.writerow(self._columns)

def __call__(self, *items: List[str]):
"""Add a URL to the CSV file.
def __call__(self, audio: Audio):
"""Write the information of the audio to the CSV file.
Args:
items (List[str]): the items to add to the CSV file
audio (Audio): the audio object to write to the CSV file
"""

with open(self._filename, "a", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(items)

# Write the values of the audio object to the CSV file
writer.writerow(
[
"" if getattr(audio, field) is None else getattr(audio, field)
for field in self._columns
]
)
17 changes: 0 additions & 17 deletions test.py

This file was deleted.

23 changes: 0 additions & 23 deletions test2.py

This file was deleted.

0 comments on commit 58053fd

Please sign in to comment.