Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cambridge CLI (test version) #2

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions src/providers/cambridge/cambridge_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import requests
from bs4 import BeautifulSoup
import click

from functions import (
get_page_content,
find_links_and_tags,
download_file
)

@click.command()
@click.option('--subject', prompt='Enter the subject', type=click.Choice(['computer science', 'engineering', 'mathematics', 'physics', 'science and engineering', 'statistics']))
def main(subject):
    """Download the 2023 Cambridge KBART file for *subject* and report new releases.

    Fetches the librarians' KBART index page, finds the download link whose
    text matches the chosen subject, downloads it next to this script and then
    diffs it against the stored ``<subject>_test.tsv`` baseline.
    """
    url = 'https://www.cambridge.org/core/services/librarians/kbart'
    response_text = get_page_content(url)

    # Guard clause: nothing to do if the index page could not be fetched.
    if not response_text:
        return

    soup = BeautifulSoup(response_text, 'html.parser')

    prefix = 'cambridge ebooks and partner presses: 2023 '
    subjects = [subject]

    found_links, tags = find_links_and_tags(soup, subjects, prefix)

    # zip() pairs links with subjects positionally; with a single chosen
    # subject only the first matching link is used.
    # Renamed the loop variable so it no longer shadows the CLI parameter.
    for link, matched_subject in zip(found_links, subjects):
        full_url = 'https://www.cambridge.org' + link
        download_file(full_url, link.split('/')[-1], matched_subject)
        process_subject(matched_subject)

def process_subject(subject):
    """Compare the freshly downloaded ``<subject>.tsv`` with the stored
    ``<subject>_test.tsv`` baseline and echo every line that is new."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    current_path = os.path.join(base_dir, f"{subject}.tsv")
    baseline_path = os.path.join(base_dir, f"{subject}_test.tsv")

    click.echo("New releases:")
    for new_line in find_unique_lines(current_path, baseline_path):
        click.echo(new_line)

def find_unique_lines(file1_path, file2_path):
    """Return the lines present in *file1_path* but absent from *file2_path*.

    Parameters:
        file1_path: path of the newer file (lines to report).
        file2_path: path of the baseline file.

    Returns:
        Sorted list of lines (with trailing newlines) unique to the first
        file. Sorting makes the output deterministic — a raw set difference
        would echo the "new releases" in arbitrary order on each run.

    Raises:
        OSError: if either file cannot be opened.
    """
    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
        lines1 = set(file1)
        lines2 = set(file2)

    return sorted(lines1 - lines2)

# Run the CLI only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
53 changes: 53 additions & 0 deletions src/providers/cambridge/functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you please rename it to utils.py?

import requests
from bs4 import BeautifulSoup
import random

def get_page_content(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Parameters:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up
            (new optional parameter; default keeps callers working).

    Returns:
        The page body as a string on success, or ``None`` on any HTTP or
        network failure. The original version only handled non-200 status
        codes; connection errors and timeouts escaped as uncaught
        exceptions, which this version also reports and absorbs.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        # Covers non-2xx responses, DNS failures, timeouts, etc.
        print(f"Error: Failed to fetch the webpage ({err})")
        return None
    return response.text
Comment on lines +6 to +12
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_page_content(url):
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
print(f"Error: Failed to fetch the webpage ({response.status_code})")
return None
def get_page_content(url):
response = requests.get(url)
try:
r = requests.get('http://www.google.com/nothere')
r.raise_for_status()
return r.text
except requests.exceptions.HTTPError as err:
print(f"Error: Failed to fetch the webpage ({response.status_code})")
return None

Also, since this is a CLI, it's better to use click's echo function.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this code, I get "Error: Failed to fetch the webpage (200)" and the rest of the code does not run.


def find_links_and_tags(soup, subjects, prefix):
    """Collect anchor links whose text mentions ``prefix + subject``.

    Parameters:
        soup: a BeautifulSoup document of the KBART index page.
        subjects: iterable of lowercase subject names to look for.
        prefix: lowercase text that precedes each subject in the link text.

    Returns:
        ``(found_links, labels)`` — parallel lists of ``href`` values and the
        corresponding ``data-ga-event-label`` attributes (``None`` when the
        anchor has no such attribute, matching the original behaviour).
    """
    found_links = []
    labels = []

    for subject in subjects:
        target_word = prefix + subject
        # Match text nodes case-insensitively; the lambda is consumed by
        # find_all within this iteration, so binding target_word is safe.
        for text_node in soup.find_all(string=lambda text: text and target_word in text.lower()):
            # Fixed: the original reassigned the loop variable `tag` inside
            # the loop body, shadowing the node it was iterating over.
            parent = text_node.parent
            if parent.name == 'a' and parent.get('href'):
                found_links.append(parent.get('href'))
                labels.append(parent.get('data-ga-event-label'))

    return found_links, labels

def download_file(url, target_filename, desired_filename):
    """Download *url* and save it as ``<desired_filename>.tsv`` beside this script.

    Parameters:
        url: direct link to the remote file.
        target_filename: remote file name. Currently unused — kept only for
            interface compatibility with existing callers; TODO(review):
            drop it or use it once callers are updated.
        desired_filename: local base name; the file is written as
            ``<desired_filename>.tsv``.
    """
    response = requests.get(url)
    if response.status_code == 200:
        filename = f"{desired_filename}.tsv"
        target_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
        # Binary mode: the payload is written verbatim, no decoding.
        with open(target_filepath, 'wb') as file:
            file.write(response.content)
        # Fixed: the success message previously named no file at all.
        print(f"Successfully downloaded {filename}")
    else:
        print(f"Error: Failed to download {desired_filename} ({response.status_code})")

def remove_random_lines(input_file, output_file):
    """Write *output_file* as a copy of *input_file* with up to five random
    data lines removed, then delete *input_file*.

    Used only to fabricate "old" fixture data for testing. The first line
    (the header) is always preserved. Fixed: the original built the output
    with ``random.sample`` over the kept lines, which *shuffled* the
    survivors; this version removes random lines but keeps the remaining
    lines in their original order.

    Raises:
        IndexError: if *input_file* is empty (same as the original).
    """
    with open(input_file, 'r') as f:
        lines = f.readlines()

    header = lines[0]  # header row is always kept
    body = lines[1:]

    # Remove at most 5 lines, and never the header.
    n_remove = random.randint(0, min(5, len(lines) - 1))
    doomed = set(random.sample(range(len(body)), min(n_remove, len(body))))
    kept = [header] + [line for i, line in enumerate(body) if i not in doomed]

    with open(output_file, 'w') as f:
        f.writelines(kept)
    os.remove(input_file)
Comment on lines +42 to +53
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean by "random" lines?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is used for testing purposes. I'm removing a random assortment of lines from the files that I initially fetch from the publisher. I'm using this as the "old" data, so that when we fetch the new files using the cli, we have something to compare against. So essentially just for testing.

39 changes: 39 additions & 0 deletions src/providers/cambridge/get_test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the difference between this and src/providers/cambridge/cambridge_cli.py? Would you like to just test it?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, just for testing.

import requests
from bs4 import BeautifulSoup
import random

from functions import (
get_page_content,
find_links_and_tags,
download_file,
remove_random_lines
)

def main():
    """Fetch all 2023 Cambridge KBART files and fabricate baseline test data.

    For each subject the current KBART file is downloaded next to this
    script, then a copy with a few random lines removed is written as
    ``<subject>_test.tsv`` and the full download is deleted. The thinned
    copies serve as the "old" data the CLI later diffs against.
    """
    # Fixed: removed a stray bare `print` statement (the name was referenced
    # but never called, so the line did nothing).
    url = 'https://www.cambridge.org/core/services/librarians/kbart'
    response_text = get_page_content(url)

    if not response_text:
        return

    soup = BeautifulSoup(response_text, 'html.parser')

    prefix = 'cambridge ebooks and partner presses: 2023 '
    subjects = ['computer science', 'engineering', 'mathematics', 'physics',
                'science and engineering', 'statistics']

    found_links, tags = find_links_and_tags(soup, subjects, prefix)

    for link, subject in zip(found_links, subjects):
        full_url = 'https://www.cambridge.org' + link
        download_file(full_url, link.split('/')[-1], subject)

    # NOTE(review): the download location is hard-coded to the script's
    # directory; consider making it configurable via an environment variable.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    for subject in subjects:
        source = os.path.join(script_dir, subject) + '.tsv'
        target = source.replace('.tsv', '_test.tsv')
        remove_random_lines(source, target)
        print(f"Random lines removed from {source} and saved as {target}")

# Generate the test fixtures only when this file is executed directly.
if __name__ == "__main__":
    main()