Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cambridge CLI (test version) #2

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions src/providers/cambridge/cambridge_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import requests
from bs4 import BeautifulSoup
import click

from functions import (
get_page_content,
find_links_and_tags,
download_file
)

@click.command()
@click.option('--subject', prompt='Enter the subject', type=click.Choice(['computer science', 'engineering', 'mathematics', 'physics', 'science and engineering', 'statistics']))
def main(subject):
    """Download the 2023 Cambridge KBART file for *subject* and report new releases.

    Fetches the librarians' KBART index page, finds the download link whose
    text matches the chosen subject, downloads it next to this script and then
    diffs it against the stored ``<subject>_test.tsv`` baseline.
    """
    url = 'https://www.cambridge.org/core/services/librarians/kbart'
    response_text = get_page_content(url)

    # Guard clause: nothing to do if the index page could not be fetched.
    if not response_text:
        return

    soup = BeautifulSoup(response_text, 'html.parser')

    prefix = 'cambridge ebooks and partner presses: 2023 '
    subjects = [subject]

    found_links, tags = find_links_and_tags(soup, subjects, prefix)

    # zip() pairs links with subjects positionally; with a single chosen
    # subject only the first matching link is used.
    # Renamed the loop variable so it no longer shadows the CLI parameter.
    for link, matched_subject in zip(found_links, subjects):
        full_url = 'https://www.cambridge.org' + link
        download_file(full_url, link.split('/')[-1], matched_subject)
        process_subject(matched_subject)

def process_subject(subject):
    """Compare the freshly downloaded ``<subject>.tsv`` with the stored
    ``<subject>_test.tsv`` baseline and echo every line that is new."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    current_path = os.path.join(base_dir, f"{subject}.tsv")
    baseline_path = os.path.join(base_dir, f"{subject}_test.tsv")

    click.echo("New releases:")
    for new_line in find_unique_lines(current_path, baseline_path):
        click.echo(new_line)

def find_unique_lines(file1_path, file2_path):
    """Return the lines present in *file1_path* but absent from *file2_path*.

    Parameters:
        file1_path: path of the newer file (lines to report).
        file2_path: path of the baseline file.

    Returns:
        Sorted list of lines (with trailing newlines) unique to the first
        file. Sorting makes the output deterministic — a raw set difference
        would echo the "new releases" in arbitrary order on each run.

    Raises:
        OSError: if either file cannot be opened.
    """
    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
        lines1 = set(file1)
        lines2 = set(file2)

    return sorted(lines1 - lines2)

# Run the CLI only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
53 changes: 53 additions & 0 deletions src/providers/cambridge/functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you please rename it to utils.py?

import requests
from bs4 import BeautifulSoup
import random

def get_page_content(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Parameters:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up
            (new optional parameter; default keeps callers working).

    Returns:
        The page body as a string on success, or ``None`` on any HTTP or
        network failure. The original version only handled non-200 status
        codes; connection errors and timeouts escaped as uncaught
        exceptions, which this version also reports and absorbs.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        # Covers non-2xx responses, DNS failures, timeouts, etc.
        print(f"Error: Failed to fetch the webpage ({err})")
        return None
    return response.text
Comment on lines +6 to +12
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_page_content(url):
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
print(f"Error: Failed to fetch the webpage ({response.status_code})")
return None
def get_page_content(url):
response = requests.get(url)
try:
r = requests.get('http://www.google.com/nothere')
r.raise_for_status()
return r.text
except requests.exceptions.HTTPError as err:
print(f"Error: Failed to fetch the webpage ({response.status_code})")
return None

Also, since this is a CLI, it's better to use click's echo function.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this code, I get "Error: Failed to fetch the webpage (200)" and the rest of the code does not run.


def find_links_and_tags(soup, subjects, prefix):
    """Collect anchor links whose text mentions ``prefix + subject``.

    Parameters:
        soup: a BeautifulSoup document of the KBART index page.
        subjects: iterable of lowercase subject names to look for.
        prefix: lowercase text that precedes each subject in the link text.

    Returns:
        ``(found_links, labels)`` — parallel lists of ``href`` values and the
        corresponding ``data-ga-event-label`` attributes (``None`` when the
        anchor has no such attribute, matching the original behaviour).
    """
    found_links = []
    labels = []

    for subject in subjects:
        target_word = prefix + subject
        # Match text nodes case-insensitively; the lambda is consumed by
        # find_all within this iteration, so binding target_word is safe.
        for text_node in soup.find_all(string=lambda text: text and target_word in text.lower()):
            # Fixed: the original reassigned the loop variable `tag` inside
            # the loop body, shadowing the node it was iterating over.
            parent = text_node.parent
            if parent.name == 'a' and parent.get('href'):
                found_links.append(parent.get('href'))
                labels.append(parent.get('data-ga-event-label'))

    return found_links, labels

def download_file(url, target_filename, desired_filename):
    """Download *url* and save it as ``<desired_filename>.tsv`` beside this script.

    Parameters:
        url: direct link to the remote file.
        target_filename: remote file name. Currently unused — kept only for
            interface compatibility with existing callers; TODO(review):
            drop it or use it once callers are updated.
        desired_filename: local base name; the file is written as
            ``<desired_filename>.tsv``.
    """
    response = requests.get(url)
    if response.status_code == 200:
        filename = f"{desired_filename}.tsv"
        target_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
        # Binary mode: the payload is written verbatim, no decoding.
        with open(target_filepath, 'wb') as file:
            file.write(response.content)
        # Fixed: the success message previously named no file at all.
        print(f"Successfully downloaded {filename}")
    else:
        print(f"Error: Failed to download {desired_filename} ({response.status_code})")

def remove_random_lines(input_file, output_file):
    """Write *output_file* as a copy of *input_file* with up to five random
    data lines removed, then delete *input_file*.

    Used only to fabricate "old" fixture data for testing. The first line
    (the header) is always preserved. Fixed: the original built the output
    with ``random.sample`` over the kept lines, which *shuffled* the
    survivors; this version removes random lines but keeps the remaining
    lines in their original order.

    Raises:
        IndexError: if *input_file* is empty (same as the original).
    """
    with open(input_file, 'r') as f:
        lines = f.readlines()

    header = lines[0]  # header row is always kept
    body = lines[1:]

    # Remove at most 5 lines, and never the header.
    n_remove = random.randint(0, min(5, len(lines) - 1))
    doomed = set(random.sample(range(len(body)), min(n_remove, len(body))))
    kept = [header] + [line for i, line in enumerate(body) if i not in doomed]

    with open(output_file, 'w') as f:
        f.writelines(kept)
    os.remove(input_file)
Comment on lines +42 to +53
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean by "random" lines?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is used for testing purposes. I'm removing a random assortment of lines from the files that I initially fetch from the publisher. I'm using this as the "old" data, so that when we fetch the new files using the cli, we have something to compare against. So essentially just for testing.

39 changes: 39 additions & 0 deletions src/providers/cambridge/get_test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the difference between this and src/providers/cambridge/cambridge_cli.py? Would you like to just test it?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, just for testing.

import requests
from bs4 import BeautifulSoup
import random

from functions import (
get_page_content,
find_links_and_tags,
download_file,
remove_random_lines
)

def main():
    """Fetch all 2023 Cambridge KBART files and fabricate baseline test data.

    For each subject the current KBART file is downloaded next to this
    script, then a copy with a few random lines removed is written as
    ``<subject>_test.tsv`` and the full download is deleted. The thinned
    copies serve as the "old" data the CLI later diffs against.
    """
    # Fixed: removed a stray bare `print` statement (the name was referenced
    # but never called, so the line did nothing).
    url = 'https://www.cambridge.org/core/services/librarians/kbart'
    response_text = get_page_content(url)

    if not response_text:
        return

    soup = BeautifulSoup(response_text, 'html.parser')

    prefix = 'cambridge ebooks and partner presses: 2023 '
    subjects = ['computer science', 'engineering', 'mathematics', 'physics',
                'science and engineering', 'statistics']

    found_links, tags = find_links_and_tags(soup, subjects, prefix)

    for link, subject in zip(found_links, subjects):
        full_url = 'https://www.cambridge.org' + link
        download_file(full_url, link.split('/')[-1], subject)

    # NOTE(review): the download location is hard-coded to the script's
    # directory; consider making it configurable via an environment variable.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    for subject in subjects:
        source = os.path.join(script_dir, subject) + '.tsv'
        target = source.replace('.tsv', '_test.tsv')
        remove_random_lines(source, target)
        print(f"Random lines removed from {source} and saved as {target}")

# Generate the test fixtures only when this file is executed directly.
if __name__ == "__main__":
    main()