diff --git a/alt-text-scan.py.md b/alt-text-scan.py.md
index 9247912..8ca0809 100644
--- a/alt-text-scan.py.md
+++ b/alt-text-scan.py.md
@@ -130,71 +130,6 @@ The script generates a CSV file named after the domain being analyzed, e.g., `ex
 
 ---
 
-## Script
-
-Below is the Python script:
-
-```python
-import os
-import requests
-from bs4 import BeautifulSoup
-import pandas as pd
-from urllib.parse import urljoin, urlparse, urlunparse
-import argparse
-from tqdm import tqdm
-import xml.etree.ElementTree as ET
-import random
-import time
-from collections import defaultdict
-import re
-from textblob import TextBlob
-from readability.readability import Document
-from textstat import text_standard
-from datetime import datetime
-
-IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.tiff', '.avif', '.webp')
-
-# Function definitions
-def is_valid_image(url):
-    ...
-
-def parse_sitemap(sitemap_url, base_domain, headers=None, depth=3):
-    ...
-
-def crawl_site(start_url, max_pages=100, throttle=0):
-    ...
-
-def get_relative_url(url, base_domain):
-    ...
-
-def get_images(domain, sample_size=100, throttle=0, crawl_only=False):
-    ...
-
-def analyze_alt_text(images_df, domain, readability_threshold=8):
-    ...
-
-def process_image(img_url, img, page_url, domain, images_data):
-    ...
-
-def crawl_page(url, images_data, url_progress, domain, throttle, consecutive_errors):
-    ...
-
-# Main function
-def main(domain, sample_size=100, throttle=0, crawl_only=False):
-    ...
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Crawl a website and collect image data with alt text.")
-    parser.add_argument('domain', type=str, help='The domain to crawl (e.g., https://example.com)')
-    parser.add_argument('--sample_size', type=int, default=100, help='Number of URLs to sample from the sitemap')
-    parser.add_argument('--throttle', type=int, default=1, help='Throttle delay (in seconds) between requests')
-    parser.add_argument('--crawl_only', action='store_true', help='Start crawling directly without using the sitemap')
-    args = parser.parse_args()
-    main(args.domain, args.sample_size, throttle=args.throttle, crawl_only=args.crawl_only)
-```
-
----
-
 ## Contributing
 
 Contributions are welcome! Please open an issue or submit a pull request on [GitHub](https://github.com/CivicActions/site-evaluation-tools).