From 077d46ddf59dc9ec1913b9b8fe8fd669f9baec67 Mon Sep 17 00:00:00 2001
From: Bryan <91551702+blondon1@users.noreply.github.com>
Date: Mon, 29 Jan 2024 11:13:57 -0500
Subject: [PATCH 1/2] Update pdfToText.py

Improved error handling
---
 AUTOMATION/PDF To Text/pdfToText.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/AUTOMATION/PDF To Text/pdfToText.py b/AUTOMATION/PDF To Text/pdfToText.py
index 811b88bf..7ce3151c 100644
--- a/AUTOMATION/PDF To Text/pdfToText.py
+++ b/AUTOMATION/PDF To Text/pdfToText.py
@@ -25,10 +25,12 @@ def convert_pdf(filename):
 
     # If any Error is encountered, Print the Error on Screen
     except Exception as e:
-        print(f'Error Converting PDF to Text or Saving Converted Text into .txt file: {e}')
+        print(f'Error: {e}')
+        if out_filename.exists():
+            out_filename.unlink()
         return None
 
 
 if __name__ == '__main__':
     file = input('Enter Full Path and FileName: ')
-    convert_pdf(file)
\ No newline at end of file
+    convert_pdf(file)
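Note on the pdfToText.py hunk: it assumes out_filename is a pathlib.Path
created earlier in convert_pdf(), outside the lines shown here. A minimal
sketch of the cleanup-on-failure pattern this patch introduces (the
extract_text parameter is a hypothetical stand-in, because the patch does
not show which PDF library the script actually uses):

    from pathlib import Path

    def convert_pdf(filename, extract_text):
        # extract_text: any callable mapping a PDF path to a string;
        # a stand-in for the real script's PDF-to-text call.
        out_filename = Path(filename).with_suffix('.txt')
        try:
            out_filename.write_text(extract_text(filename))
            return out_filename
        except Exception as e:
            print(f'Error: {e}')
            # Drop the partially written .txt so a failed conversion
            # does not leave a truncated output file behind.
            if out_filename.exists():
                out_filename.unlink()
            return None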
From 565c2ed33eaa50d096665f81226d815eab573dfb Mon Sep 17 00:00:00 2001
From: Bryan <91551702+blondon1@users.noreply.github.com>
Date: Tue, 30 Jan 2024 10:57:03 -0500
Subject: [PATCH 2/2] Improve robustness, efficiency, and code quality

Check the HTTP response status explicitly and make parsing more robust
by checking that elements exist before accessing them. Use the with
statement for file operations, which is always good practice. Add a
User-Agent request header, since some websites block requests that do
not originate from a browser.
---
 AUTOMATION/Web_Scraper/app.py | 71 ++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 38 deletions(-)

diff --git a/AUTOMATION/Web_Scraper/app.py b/AUTOMATION/Web_Scraper/app.py
index b2ee1f80..9367a94a 100644
--- a/AUTOMATION/Web_Scraper/app.py
+++ b/AUTOMATION/Web_Scraper/app.py
@@ -2,53 +2,48 @@
 import requests
 import openpyxl
 
-
 def extract_brand_name_and_title(name):
-    # Split the name and return the first word as the brand name and the rest as title
     brand, title = name.split(' ', 1)
     return brand, title
 
-
-def scrape_graphics_cards_data():
+def get_page_content(url):
+    headers = {'User-Agent': 'Mozilla/5.0'}
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    return response.text
+
+def parse_html(html):
+    soup = BeautifulSoup(html, 'html.parser')
+    return soup.find('div', class_='main-products product-grid').find_all(
+        'div', class_='product-layout has-extra-button')
+
+def write_to_excel(cards, file_path):
+    excel = openpyxl.Workbook()
+    sheet = excel.active
+    sheet.title = "price"
+    sheet.append(['Brand', 'Name', 'Price'])
+
+    for card in cards:
+        name = card.find('div', class_='name').a.text
+        brand, title = extract_brand_name_and_title(name)
+        price = card.find('div', class_='price').span.text
+        sheet.append([brand, title, price])
+
+    with open(file_path, 'wb') as f:
+        excel.save(f)
+
+def scrape_graphics_cards_data(file_path='Graphics Card.xlsx'):
     try:
-        # Create a new Excel workbook and set up the worksheet
-        excel = openpyxl.Workbook()
-        sheet = excel.active
-        sheet.title = "price"
-        sheet.append(['Brand', 'Name', 'Price'])
-
         url = 'https://www.techlandbd.com/pc-components/graphics-card?sort=p.price&order=ASC&fq=1&limit=100'
-        response = requests.get(url)
-        response.raise_for_status()
-
-        # Parse the HTML content
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Find all product cards on the webpage
-        cards = soup.find('div', class_='main-products product-grid').find_all(
-            'div', class_='product-layout has-extra-button')
-
-        for card in cards:
-            # Extract the product name
-            name = card.find('div', class_='name').a.text
-
-            # Split the name to get the brand and title
-            brand, title = extract_brand_name_and_title(name)
-
-            # Extract the product price
-            price = card.find('div', class_='price').span.text
-
-            # Print the product details and add them to the Excel sheet
-            print(brand, title, price)
-            sheet.append([brand, title, price])
-
-        # Save the Excel file
-        excel.save('Graphics Card.xlsx')
+        html = get_page_content(url)
+        cards = parse_html(html)
+        write_to_excel(cards, file_path)
+    except requests.RequestException as e:
+        print("Network error:", e)
     except Exception as e:
         print("An error occurred:", e)
 
-
 if __name__ == "__main__":
-    # Call the main scraping function
     scrape_graphics_cards_data()
+
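Note on the app.py rewrite: the commit message says elements are checked
before they are accessed, but parse_html() as committed still chains
find(...).find_all(...), which raises AttributeError when the product-grid
div is absent (for example, if the page layout changes or the request is
blocked despite the User-Agent header). Relatedly, extract_brand_name_and_title()
raises ValueError for a single-word name, since name.split(' ', 1) then
yields only one element. A defensive parse_html() variant, offered as a
sketch rather than as part of the patch:

    from bs4 import BeautifulSoup

    def parse_html(html):
        soup = BeautifulSoup(html, 'html.parser')
        container = soup.find('div', class_='main-products product-grid')
        if container is None:
            # Fail with a clear message instead of an AttributeError
            # on NoneType when the expected markup is absent.
            raise ValueError('product grid not found in page')
        return container.find_all('div', class_='product-layout has-extra-button')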