Merge pull request #257 from bthrx/webscraper

Webscraper

Showing 2 changed files with 91 additions and 0 deletions.
@@ -0,0 +1,68 @@
import argparse
import sys

import requests
from bs4 import BeautifulSoup

# Argument parsing setup
parser = argparse.ArgumentParser(description='Extract text from specified elements within the body of a given webpage, with options for post-processing. Example: --element "p,code" --process "div:decompose=script, p:extract=span"')
parser.add_argument('-u', '--url', help='URL of the webpage to scrape', default=None)
parser.add_argument('-e', '--element', help='Comma-separated list of elements to match within the body.', default='')
parser.add_argument('-p', '--process', help='Comma-separated list of post-process actions by tag and action, e.g., "div:decompose=script, p:extract=span".', default='')
parser.add_argument('-d', '--debug', action='store_true', help='Enable debug mode to print matched element names.')
args = parser.parse_args()

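# Example invocations (the script name and URL are placeholders, not from the commit):
#   python webscraper.py -u https://example.com -e "p,code"
#   echo "https://example.com" | python webscraper.py -e div -p "div:decompose=script"
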
# Function definitions
def get_input_url():
    # Fall back to a URL piped in on stdin when -u/--url is not given.
    if not sys.stdin.isatty():
        return sys.stdin.readline().strip()
    return None

def apply_processing(element, actions):
    # Each action has the form "tag:method=subtag1,subtag2"; e.g.
    # "div:decompose=script" removes <script> children from <div> elements.
    for action in actions:
        target, command = action.split(':')
        method, subtags = command.split('=')
        if element.name == target:
            for tag in subtags.split(','):
                for sub_element in element.find_all(tag.strip()):
                    getattr(sub_element, method)()

def process_element_text(element):
    # Collect text from tag children (newline-terminated) and from bare strings.
    texts = []
    for child in element.children:
        if hasattr(child, 'name') and child.name:
            child_text = child.get_text(strip=True)
            if child_text:
                texts.append(child_text + '\n')
        elif child.string:
            texts.append(child.string.strip())
    return ' '.join(texts)

# Main code logic
if args.url is None:
    args.url = get_input_url()

if not args.url:
    sys.stderr.write("Error: No URL provided. Please provide a URL as an argument or pipe one in.\n")
    sys.exit(1)

tags = set()
if args.element:
    tags.update([tag.strip() for tag in args.element.split(',')])

process_actions = [action.strip() for action in args.process.split(',')] if args.process else []

response = requests.get(args.url, timeout=30)  # timeout avoids hanging forever on unresponsive hosts
if response.status_code != 200:
    sys.stderr.write(f"Error: Failed to fetch the URL with status code {response.status_code}.\n")
    sys.exit(1)

soup = BeautifulSoup(response.text, 'html.parser')
body_content = soup.body or soup  # html.parser does not synthesize <body>; fall back to the whole document

for element in body_content.find_all(tags):
    if args.debug:
        print(f"Matched element: {element.name}")
    apply_processing(element, process_actions)
    processed_text = process_element_text(element)
    print(processed_text)
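
For reference, a minimal sketch of what the --process grammar drives under the hood: decompose removes a sub-element and its text entirely, while extract detaches it from the tree but keeps it usable afterwards. The HTML fragments and tag choices below are invented for illustration; only the BeautifulSoup calls mirror what the script does via getattr.

from bs4 import BeautifulSoup

# "div:decompose=script": drop <script> children from each <div>.
html = '<div><p>Keep me.</p><script>alert("drop me");</script></div>'
div = BeautifulSoup(html, 'html.parser').find('div')
for script in div.find_all('script'):
    script.decompose()              # same call the scraper issues as getattr(sub_element, 'decompose')()
print(div.get_text(strip=True))     # -> Keep me.

# "p:extract=span": detach <span> children from each <p> instead.
p_html = '<p>Text <span>aside</span> more.</p>'
p = BeautifulSoup(p_html, 'html.parser').find('p')
for span in p.find_all('span'):
    span.extract()                  # removed from the parse tree, returned to the caller
print(p.get_text(' ', strip=True))  # -> Text more.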