Cambridge CLI (test version) #2
base: main
@@ -0,0 +1,53 @@
import os
import requests
from bs4 import BeautifulSoup
import click

from functions import (
    get_page_content,
    find_links_and_tags,
    download_file
)

@click.command()
@click.option('--subject', prompt='Enter the subject', type=click.Choice(['computer science', 'engineering', 'mathematics', 'physics', 'science and engineering', 'statistics']))
def main(subject):
    url = 'https://www.cambridge.org/core/services/librarians/kbart'
    response_text = get_page_content(url)

    if response_text:
        soup = BeautifulSoup(response_text, 'html.parser')

        prefix = 'cambridge ebooks and partner presses: 2023 '
        subjects = [subject]

        found_links, tags = find_links_and_tags(soup, subjects, prefix)

        for link, subject in zip(found_links, subjects):
            link2 = 'https://www.cambridge.org' + link
            download_file(link2, link.split('/')[-1], subject)
            process_subject(subject)

def process_subject(subject):
    script_dir = os.path.dirname(os.path.abspath(__file__))
    file1_path = os.path.join(script_dir, f"{subject}.tsv")
    file2_path = os.path.join(script_dir, f"{subject}_test.tsv")

    unique_lines = find_unique_lines(file1_path, file2_path)

    click.echo("New releases:")
    for line in unique_lines:
        click.echo(line)

def find_unique_lines(file1_path, file2_path):
    unique_lines = []

    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
        lines1 = set(file1.readlines())
        lines2 = set(file2.readlines())
        unique_lines = lines1.difference(lines2)

    return unique_lines

if __name__ == "__main__":
    main()
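For reference, a minimal sketch of how this command could be exercised without the interactive prompt, using click's built-in test runner. The module name cambridge_cli is an assumption about how this file is importable; adjust to the actual repo layout. Note that invoking it runs the real fetch against the live site.

    # Hedged sketch: driving the CLI via click's CliRunner.
    from click.testing import CliRunner

    from cambridge_cli import main  # hypothetical module name

    runner = CliRunner()
    # Passing --subject explicitly skips the interactive prompt.
    result = runner.invoke(main, ['--subject', 'mathematics'])
    print(result.output)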
@@ -0,0 +1,53 @@
import os
import requests
from bs4 import BeautifulSoup
import random

def get_page_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error: Failed to fetch the webpage ({response.status_code})")
        return None
Comment on lines +6 to +12:

Suggested change. Also, since it's a cli it's better to use click's …

Reply: With this code, I get "Error: Failed to fetch the webpage (200)" and the rest of the code does not run.
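The suggested change itself was not captured in this export. A minimal sketch of what a click-friendly version might look like, assuming the intent was click.echo for output and raise_for_status() for error handling (this is an assumption, not the reviewer's actual suggestion):

    # Hedged sketch only; not the original suggested change.
    import click
    import requests

    def get_page_content(url):
        response = requests.get(url)
        try:
            response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
        except requests.HTTPError:
            click.echo(f"Error: Failed to fetch the webpage ({response.status_code})", err=True)
            return None
        return response.text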
def find_links_and_tags(soup, subjects, prefix):
    found_links = []
    tags = []

    for subject in subjects:
        target_word = prefix + subject
        for tag in soup.find_all(string=lambda text: text and target_word in text.lower()):
            parent_tag = tag.parent
            if parent_tag.name == 'a' and parent_tag.get('href'):
                link = parent_tag.get('href')
                tag = parent_tag.get('data-ga-event-label')
                found_links.append(link)
                tags.append(tag)
    return found_links, tags

def download_file(url, target_filename, desired_filename):
    response = requests.get(url)
    if response.status_code == 200:
        filename = f"{desired_filename}.tsv"
        target_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
        with open(target_filepath, 'wb') as file:
            file.write(response.content)
        print(f'Successfully downloaded {filename}')

    else:
        print(f"Error: Failed to download {desired_filename} ({response.status_code})")

def remove_random_lines(input_file, output_file):
    with open(input_file, 'r') as f:
        lines = f.readlines()

    first_line = lines[0]  # Preserve the first line

    lines_to_remove = random.randint(0, min(5, len(lines) - 1))  # Ensure at least one line is kept
    remaining_lines = random.sample(lines[1:], max(len(lines) - 1 - lines_to_remove, 0))
    lines_to_keep = [first_line] + remaining_lines

    with open(output_file, 'w') as f:
        f.writelines(lines_to_keep)
    os.remove(input_file)
Comment on lines +42 to +53:

What do you mean by "random" lines?

Reply: This function is used for testing purposes. I'm removing a random assortment of lines from the files that I initially fetch from the publisher. I'm using this as the "old" data, so that when we fetch the new files using the cli, we have something to compare against. So essentially just for testing.
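To make that flow concrete, a hedged sketch (file names are illustrative, and the import path for find_unique_lines is assumed, since it lives next to the CLI entry point):

    import shutil

    from functions import remove_random_lines    # as imported elsewhere in this PR
    from cambridge_cli import find_unique_lines  # hypothetical module name

    # Keep the freshly downloaded file intact; thin out a copy instead,
    # since remove_random_lines deletes its input file when done.
    shutil.copy('mathematics.tsv', 'mathematics_input.tsv')
    remove_random_lines('mathematics_input.tsv', 'mathematics_test.tsv')

    # The difference between the fresh file and the thinned "old" file
    # stands in for the new releases.
    for line in find_unique_lines('mathematics.tsv', 'mathematics_test.tsv'):
        print(line)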
@@ -0,0 +1,39 @@
import os

Comment: what's the difference between this and src/providers/cambridge/cambridge_cli.py? Would you like to just test it?

Reply: Yes, just for testing.
import requests
from bs4 import BeautifulSoup
import random

from functions import (
    get_page_content,
    find_links_and_tags,
    download_file,
    remove_random_lines
)

def main():
    url = 'https://www.cambridge.org/core/services/librarians/kbart'
    response_text = get_page_content(url)

    if response_text:
        soup = BeautifulSoup(response_text, 'html.parser')

        prefix = 'cambridge ebooks and partner presses: 2023 '
        subjects = ['computer science', 'engineering', 'mathematics', 'physics', 'science and engineering', 'statistics']

        found_links, tags = find_links_and_tags(soup, subjects, prefix)

        for link, subject in zip(found_links, subjects):
            link2 = 'https://www.cambridge.org' + link
            download_file(link2, link.split('/')[-1], subject)

        subjectfiles = []
        for subject in subjects:
            subjectfiles.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), subject) + '.tsv')
Comment: let's make the download location an env var, so we can easily configure it (see the sketch after this file)
        for subject in subjectfiles:
            output_file = subject.replace('.tsv', '_test.tsv')
            remove_random_lines(subject, output_file)
            print(f"Random lines removed from {subject} and saved as {output_file}")

if __name__ == "__main__":
    main()
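A hedged sketch of the reviewer's env-var suggestion: read the download location from the environment instead of deriving it from __file__. The variable name DOWNLOAD_DIR is a made-up placeholder, and subjects is assumed to be the list defined above.

    import os

    # Fall back to the script's directory when the variable is unset.
    download_dir = os.environ.get(
        'DOWNLOAD_DIR',
        os.path.dirname(os.path.abspath(__file__))
    )

    subjectfiles = [os.path.join(download_dir, f"{subject}.tsv") for subject in subjects]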
Comment: could you please rename it to utils.py?