Update alt-text-scan.py.md
removing script
mgifford authored Nov 27, 2024
1 parent 9d76e73 commit 229fbda
Showing 1 changed file with 0 additions and 65 deletions.
alt-text-scan.py.md: 0 additions & 65 deletions
@@ -130,71 +130,6 @@ The script generates a CSV file named after the domain being analyzed, e.g., `ex

---

## Script

Below is the Python script:

```python
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse, urlunparse
import argparse
from tqdm import tqdm
import xml.etree.ElementTree as ET
import random
import time
from collections import defaultdict
import re
from textblob import TextBlob
from readability.readability import Document
from textstat import text_standard
from datetime import datetime

IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.tiff', '.avif', '.webp')

# Function definitions (bodies elided in this listing)
def is_valid_image(url):
    # Check whether a URL points to a supported image type.
    ...

def parse_sitemap(sitemap_url, base_domain, headers=None, depth=3):
    # Collect page URLs from the sitemap, following nested sitemaps up to `depth` levels.
    ...

def crawl_site(start_url, max_pages=100, throttle=0):
    # Crawl internal links from `start_url`, visiting at most `max_pages` pages.
    ...

def get_relative_url(url, base_domain):
    # Normalize an absolute URL to a path relative to the base domain.
    ...

def get_images(domain, sample_size=100, throttle=0, crawl_only=False):
    # Gather image data from sampled sitemap URLs, or by crawling when `crawl_only` is set.
    ...

def analyze_alt_text(images_df, domain, readability_threshold=8):
    # Assess each image's alt text and write the results to the CSV report.
    ...

def process_image(img_url, img, page_url, domain, images_data):
    # Record a single image's URL, alt text, and source page.
    ...

def crawl_page(url, images_data, url_progress, domain, throttle, consecutive_errors):
    # Fetch one page, extract its images, and track consecutive request errors.
    ...

# Main function
def main(domain, sample_size=100, throttle=0, crawl_only=False):
    # Collect images for the domain, then analyze their alt text.
    ...

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Crawl a website and collect image data with alt text.")
    parser.add_argument('domain', type=str, help='The domain to crawl (e.g., https://example.com)')
    parser.add_argument('--sample_size', type=int, default=100, help='Number of URLs to sample from the sitemap')
    parser.add_argument('--throttle', type=int, default=1, help='Throttle delay (in seconds) between requests')
    parser.add_argument('--crawl_only', action='store_true', help='Start crawling directly without using the sitemap')
    args = parser.parse_args()
    main(args.domain, args.sample_size, throttle=args.throttle, crawl_only=args.crawl_only)
```
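
For reference, a typical installation and invocation might look like the following; the flags mirror the `argparse` definitions above, while the filename `alt-text-scan.py` and the PyPI package names are assumptions inferred from the document title and the imports:

```bash
# Install third-party dependencies (assumed package names for the imports above)
pip install requests beautifulsoup4 pandas tqdm textblob readability-lxml textstat

# Sample up to 200 sitemap URLs, waiting 2 seconds between requests
python alt-text-scan.py https://example.com --sample_size 200 --throttle 2

# Skip the sitemap and crawl the site directly instead
python alt-text-scan.py https://example.com --crawl_only
```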

---

## Contributing

Contributions are welcome! Please open an issue or submit a pull request on [GitHub](https://github.com/CivicActions/site-evaluation-tools).
