From 29af2e80eb0cfe4be36b6068b9c2567f087f18dd Mon Sep 17 00:00:00 2001
From: Pedro Gomes
Date: Wed, 18 Dec 2024 18:53:02 +0700
Subject: [PATCH] GHA CAR daily report (#201)

* faithful report
* fix
* fix
* Automated Change
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* move to python, parallelize
* fix
* fix
* fix
* fix
* remove old file
* run every 1h
* every 4h

---------

Co-authored-by: Lusitaniae
---
 .github/faithful-data-report.py   | 229 ++++++++++++++++++++++++++++++
 .github/workflows/data-report.yml |  41 ++++++
 2 files changed, 270 insertions(+)
 create mode 100644 .github/faithful-data-report.py
 create mode 100644 .github/workflows/data-report.yml

diff --git a/.github/faithful-data-report.py b/.github/faithful-data-report.py
new file mode 100644
index 00000000..fac6b2cc
--- /dev/null
+++ b/.github/faithful-data-report.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+import asyncio
+import aiohttp
+from typing import Optional
+from dataclasses import dataclass
+
+@dataclass
+class EpochData:
+    epoch: int
+    car: str = "n/a"
+    sha: str = "n/a"
+    sha_url: str = "n/a"
+    size: str = "n/a"
+    poh: str = "n/a"
+    poh_url: str = "n/a"
+    txmeta: str = "n/a"
+    txmeta_url: str = "n/a"
+    deals: str = "n/a"
+    indices: str = "n/a"
+    indices_size: str = "n/a"
+
+class FaithfulDataReport:
+    def __init__(self):
+        self.host = "https://files.old-faithful.net"
+        self.deals_host = "https://filecoin-car-storage-cdn.b-cdn.net"
+
+    async def check_url(self, session: aiohttp.ClientSession, url: str) -> bool:
+        try:
+            async with session.head(url, allow_redirects=True) as response:
+                return response.status == 200
+        except Exception:
+            return False
+
+    async def fetch_text(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
+        try:
+            async with session.get(url) as response:
+                if response.status == 200:
+                    return await response.text()
+        except Exception:
+            pass
+        return None
+
+    async def get_size(self, session: aiohttp.ClientSession, url: str) -> str:
+        try:
+            async with session.head(url) as response:
+                if response.status == 200:
+                    size_bytes = int(response.headers.get('content-length', 0))
+                    size_gb = round(size_bytes / (1024 * 1024 * 1024))
+                    return str(size_gb)
+        except Exception:
+            pass
+        return "n/a"
+
+    async def get_indices(self, session: aiohttp.ClientSession, epoch: int) -> str:
+        cid_url = f"{self.host}/{epoch}/epoch-{epoch}.cid"
+
+        # Get the epoch CID first; most index file names embed it
+        bafy = await self.fetch_text(session, cid_url)
+        if not bafy:
+            return "n/a"
+        bafy = bafy.strip()  # drop any trailing newline so the file names stay valid
+
+        # Check all required index files
+        index_files = [
+            f"epoch-{epoch}-{bafy}-mainnet-cid-to-offset-and-size.index",
+            f"epoch-{epoch}-{bafy}-mainnet-sig-to-cid.index",
+            f"epoch-{epoch}-{bafy}-mainnet-sig-exists.index",
+            f"epoch-{epoch}-{bafy}-mainnet-slot-to-cid.index",
+            f"epoch-{epoch}-gsfa.index.tar.zstd"
+        ]
+
+        checks = await asyncio.gather(*[
+            self.check_url(session, f"{self.host}/{epoch}/{file}")
+            for file in index_files
+        ])
+
+        return f"{self.host}/{epoch}/epoch-{epoch}-indices" if all(checks) else "n/a"
+
+    async def get_indices_size(self, session: aiohttp.ClientSession, epoch: int) -> str:
+        cid_url = f"{self.host}/{epoch}/epoch-{epoch}.cid"
+
+        # Get the CID first
+        bafy = await self.fetch_text(session, cid_url)
+        if not bafy:
+            return "n/a"
+        bafy = bafy.strip()
+
+        # Sum the sizes of all required index files
+        index_files = [
+            f"epoch-{epoch}-{bafy}-mainnet-cid-to-offset-and-size.index",
+            f"epoch-{epoch}-{bafy}-mainnet-sig-to-cid.index",
+            f"epoch-{epoch}-{bafy}-mainnet-sig-exists.index",
+            f"epoch-{epoch}-{bafy}-mainnet-slot-to-cid.index",
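+            # The gsfa (getSignaturesForAddress) index ships as a zstd tarball
+            # and, unlike the entries above, carries no epoch CID in its name.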
f"epoch-{epoch}-gsfa.index.tar.zstd" + ] + + sizes = await asyncio.gather(*[ + self.get_size(session, f"{self.host}/{epoch}/{file}") + for file in index_files + ]) + + # Convert sizes to integers, treating "n/a" as 0 + size_ints = [int(size) if size != "n/a" else 0 for size in sizes] + + # Sum up all sizes + total_size = sum(size_ints) + + return str(total_size) if total_size > 0 else "n/a" + + async def get_deals(self, session: aiohttp.ClientSession, epoch: int) -> str: + deals_url = f"{self.deals_host}/{epoch}/deals.csv" + deals_content = await self.fetch_text(session, deals_url) + + if deals_content and len(deals_content.splitlines()) > 1: + return deals_url + return "n/a" + + async def get_epoch_data(self, session: aiohttp.ClientSession, epoch: int) -> EpochData: + car_url = f"{self.host}/{epoch}/epoch-{epoch}.car" + sha_url = f"{self.host}/{epoch}/epoch-{epoch}.sha256" + poh_url = f"{self.host}/{epoch}/poh-check.log" + txmeta_url = f"{self.host}/{epoch}/tx-metadata-check.log" + + # Check if CAR exists first + car_exists = await self.check_url(session, car_url) + if not car_exists: + return EpochData(epoch=epoch) + + # Gather all data concurrently + sha, size, poh, txmeta, indices, indices_size, deals = await asyncio.gather( + self.fetch_text(session, sha_url), + self.get_size(session, car_url), + self.fetch_text(session, poh_url), + self.fetch_text(session, txmeta_url), + self.get_indices(session, epoch), + self.get_indices_size(session, epoch), + self.get_deals(session, epoch) + ) + + return EpochData( + epoch=epoch, + car=car_url, + sha=sha if sha else "n/a", + sha_url=sha_url, + size=size, + poh=poh if poh else "n/a", + poh_url=poh_url, + txmeta=txmeta if txmeta else "n/a", + txmeta_url=txmeta_url, + deals=deals, + indices=indices, + indices_size=indices_size + ) + + def format_row(self, data: EpochData) -> str: + car_cell = f"[epoch-{data.epoch}.car]({data.car})" if data.car != "n/a" else "✗" + sha_cell = f"[{data.sha[:7]}]({data.sha_url})" if data.sha != "n/a" else "✗" + size_cell = f"{data.size} GB" if data.size != "n/a" else "✗" + txmeta_cell = f"[✓]({data.txmeta_url})" if validate_txmeta_output(data.txmeta) else "✗" + poh_cell = f"[✓]({data.poh_url})" if validate_poh_output(data.poh) else "✗" + indices_cell = "✓" if data.indices != "n/a" else "✗" + indices_size_cell = f"{data.indices_size} GB" if data.indices_size != "n/a" else "✗" + deals_cell = f"[✓]({data.deals})" if data.deals != "n/a" else "✗" + + return f"| {data.epoch} | {car_cell} | {sha_cell} | {size_cell} | {txmeta_cell} | {poh_cell} | {indices_cell} | {indices_size_cell} | {deals_cell} |" + + async def get_current_epoch(self) -> int: + async with aiohttp.ClientSession() as session: + async with session.post( + 'https://api.mainnet-beta.solana.com', + json={"jsonrpc":"2.0","id":1, "method":"getEpochInfo"} + ) as response: + data = await response.json() + return int(data['result']['epoch']) + + async def run(self): + current_epoch = await self.get_current_epoch() + epochs = range(current_epoch, -1, -1) # descending order + + print("| Epoch # | CAR | CAR SHA256 | CAR filesize | tx meta check | poh check | Indices | Indices Size | Filecoin Deals |") + print("|---|---|---|---|---|---|---|---|---|") + + # concurrency levels + chunk_size = 20 + + async with aiohttp.ClientSession() as session: + for i in range(0, len(epochs), chunk_size): + chunk = epochs[i:i + chunk_size] + results = await asyncio.gather( + *[self.get_epoch_data(session, epoch) for epoch in chunk] + ) + + # Print results in order + for result in results: + 
+                    print(self.format_row(result))
+
+def validate_txmeta_output(txmeta_text: str) -> bool:
+    """
+    Validates that the tx-metadata check output shows zero missing entries
+    and zero parsing errors.
+    Returns True if valid, False otherwise.
+    """
+    if txmeta_text == "n/a":
+        return False
+
+    try:
+        return 'Transactions with missing metadata: 0' in txmeta_text and \
+               'Transactions with metadata parsing error: 0' in txmeta_text
+    except Exception:
+        return False
+
+def validate_poh_output(poh_text: str) -> bool:
+    """
+    Validates the PoH check output.
+    Returns True if valid, False otherwise.
+    """
+    if poh_text == "n/a":
+        return False
+
+    try:
+        return 'Successfully checked PoH on CAR file for epoch' in poh_text
+    except Exception:
+        return False
+
+def main():
+    report = FaithfulDataReport()
+    asyncio.run(report.run())
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/workflows/data-report.yml b/.github/workflows/data-report.yml
new file mode 100644
index 00000000..86b649a2
--- /dev/null
+++ b/.github/workflows/data-report.yml
@@ -0,0 +1,41 @@
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+name: Data Report Generator
+on:
+  push:
+  schedule:
+    - cron: '3 */4 * * *' # Run every 4 hours
+  workflow_dispatch: # Allow manual trigger
+
+jobs:
+  generate-report:
+    runs-on: ubuntu-latest
+
+    permissions:
+      # Give the default GITHUB_TOKEN write permission to commit and push the
+      # added or changed files to the repository.
+      contents: write
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Generate Report
+        run: |
+          mkdir -p docs
+          python3 -m pip install aiohttp
+          python3 .github/faithful-data-report.py > docs/CAR-REPORT.md
+
+      # Commit the regenerated report back to the repository
+      - uses: EndBug/add-and-commit@v9
+        with:
+          message: Old Faithful CAR data report update
+          new_branch: 'gha-report'
+          add: 'docs/*.md'
+          author_name: github-actions[bot]
+          author_email: 41898282+github-actions[bot]@users.noreply.github.com
+          fetch: origin gha-report
+          push: origin gha-report --force
+          pull: ' '