Skip to content

Commit

Permalink
Fix bug in uploads
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilwoodruff committed Sep 17, 2024
1 parent 84ab3ff commit 153339b
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 51 deletions.
2 changes: 1 addition & 1 deletion policyengine_uk_data/storage/upload_completed_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
FOLDER = Path(__file__).parent

FILES = [
"cps_2022_23.h5",
"frs_2022_23.h5",
"enhanced_frs_2022_23.h5",
"extended_frs_2022_23.h5",
"reweighted_frs_2022_23.h5",
Expand Down
77 changes: 27 additions & 50 deletions policyengine_uk_data/utils/github.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import os
import requests
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time

auth_headers = {
Expand Down Expand Up @@ -62,66 +60,45 @@ def download(
f.write(response.content)


def create_session_with_retries():
    """Return a ``requests.Session`` whose HTTPS adapter uses a retry policy.

    The policy is ``Retry(total=5, backoff_factor=1)`` with a status
    force-list of 502, 503 and 504. Only the ``https://`` prefix is
    mounted with this adapter.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[502, 503, 504],
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    http.mount("https://", https_adapter)
    return http


def upload(
    org: str, repo: str, release_tag: str, file_name: str, file_path: str
) -> bytes:
    """Upload a local file as an asset of a GitHub release.

    The release's existing assets are listed first; if one is already
    named ``file_name`` the upload is skipped. Otherwise the file is read
    into memory and sent in a single POST to the GitHub uploads endpoint.

    Args:
        org: Owner (user or organisation) of the repository.
        repo: Repository name.
        release_tag: Tag of the release, resolved to an id via
            ``get_release_id``.
        file_name: Name the asset is given in the release.
        file_path: Path of the local file to upload.

    Returns:
        The decoded JSON body of the upload response, or ``None`` when an
        asset with this name already exists and the upload is skipped.
        NOTE(review): the ``-> bytes`` annotation does not match either
        value; it is left untouched so the signature stays unchanged.

    Raises:
        ValueError: If the upload response status code is not 201.
    """
    release_id = get_release_id(org, repo, release_tag)

    # First, list release assets so an existing asset is not re-uploaded.
    # NOTE(review): only the first page of the listing is inspected;
    # confirm whether releases can hold more assets than one page returns.
    url = f"https://api.github.com/repos/{org}/{repo}/releases/{release_id}/assets"
    response = requests.get(url, headers=auth_headers).json()
    names = [asset["name"] for asset in response]
    if file_name in names:
        print(
            f"Asset {file_name} already exists in release {release_tag} of {org}/{repo}, skipping."
        )
        return

    url = f"https://uploads.github.com/repos/{org}/{repo}/releases/{release_id}/assets?name={file_name}"

    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Content-Type": "application/octet-stream",
        **auth_headers,
    }

    # Read the whole file up front and send it as one request body; this
    # is the single upload path (the retry/progress-bar variant is gone).
    with open(file_path, "rb") as f:
        data = f.read()

    response = requests.post(
        url,
        headers=headers,
        data=data,
    )

    if response.status_code != 201:
        raise ValueError(
            f"Invalid response code {response.status_code} for url {url}. Received: {response.text}"
        )

    return response.json()



def set_pr_auto_review_comment(text: str):
Expand Down

0 comments on commit 153339b

Please sign in to comment.