create_dataset_and_upload_to_s3.py
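
# Generates a synthetic stock-price dataset (date, ticker, OHLC prices, volume)
# as a CSV file and uploads it to an S3 bucket under the stock_data_monthly/
# prefix. Prompts for the file name, row count, and date range at run time.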
import random
import time
from datetime import datetime, timedelta

import boto3
import numpy as np
import pandas as pd
from rich.progress import Progress


def create_dataset(file_name, num_rows, start_date, end_date):
    """Create a random stock dataset and write it to a CSV file."""
    with Progress() as progress:
        task = progress.add_task("[cyan]Creating dataset...", total=num_rows)
        # Build the pool of candidate dates between the start and end dates
        date_range = pd.date_range(start_date, end_date).tolist()
        # Pick a random date from the pool for each row
        dates = [random.choice(date_range) for _ in range(num_rows)]
        # Generate random tickers
        tickers = np.random.choice(['AAPL', 'GOOG', 'MSFT', 'AMZN', 'FB'], num_rows)
        # Generate random open, high, low, close prices and volumes
        opens = np.random.uniform(100, 200, num_rows)
        highs = opens + np.random.uniform(0, 10, num_rows)
        lows = opens - np.random.uniform(0, 10, num_rows)
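        # np.random.uniform broadcasts array-valued bounds, so each close is
        # drawn from that row's own [low, high] interval.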
        closes = np.random.uniform(lows, highs, num_rows)
        volumes = np.random.randint(5000, 10000, num_rows)
        # Create a DataFrame
        df = pd.DataFrame({
            'Date': dates,
            'Ticker': tickers,
            'Open': opens,
            'High': highs,
            'Low': lows,
            'Close': closes,
            'Volume': volumes
        })
        # Write the DataFrame to a CSV file
        df.to_csv(file_name, index=False)
        progress.update(task, advance=num_rows)
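

# The progress bar above jumps from 0 to num_rows in a single update once
# to_csv returns. A minimal sketch of an incremental alternative, assuming a
# hypothetical chunk_size (this helper is illustrative and is not called by
# the script):
def write_csv_in_chunks(file_name, df, chunk_size=100_000):
    """Write a DataFrame to CSV in chunks so a progress bar can advance."""
    with Progress() as progress:
        task = progress.add_task("[cyan]Writing CSV...", total=len(df))
        for i in range(0, len(df), chunk_size):
            # Truncate the file and write the header only on the first chunk;
            # append without a header afterwards.
            df.iloc[i:i + chunk_size].to_csv(
                file_name,
                mode="w" if i == 0 else "a",
                header=(i == 0),
                index=False,
            )
            progress.update(task, advance=min(chunk_size, len(df) - i))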


def upload_dataset_to_s3(file_name):
    """Upload a dataset to an S3 bucket."""
    with Progress() as progress:
        task = progress.add_task("[green]Uploading dataset to S3...", total=100)
        # Create a session using your AWS credentials
        session = boto3.Session(
            aws_access_key_id="YOUR_ACCESS_KEY",
            aws_secret_access_key="YOUR_SECRET_KEY",
            region_name="YOUR_REGION",  # e.g. 'us-west-2'
        )
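        # Note: the hardcoded keys above are placeholders for illustration.
        # Calling boto3.Session() with no arguments falls back to the standard
        # credential chain (environment variables, ~/.aws/credentials, or an
        # instance/role profile), which is the safer choice in practice.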
        # Create an S3 client using the session
        s3 = session.client("s3")
        # Name of the bucket
        bucket_name = "your-bucket-name"
        # Upload the file into the 'stock_data_monthly' folder of the bucket
        s3.upload_file(
            file_name, bucket_name, "stock_data_monthly/" + file_name
        )
        progress.update(task, advance=100)
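

# The upload above also advances the bar in one jump. boto3's upload_file
# accepts a Callback invoked with the number of bytes transferred since the
# previous call, which maps directly onto the bar's advance. A sketch, assuming
# the same client setup as above (upload_with_progress and key are illustrative
# names, not part of the original script):
def upload_with_progress(s3, file_name, bucket_name, key):
    """Upload a file to S3, advancing a progress bar per transferred chunk."""
    import os  # local import; used only by this sketch

    total_bytes = os.path.getsize(file_name)
    with Progress() as progress:
        task = progress.add_task("[green]Uploading...", total=total_bytes)
        s3.upload_file(
            file_name,
            bucket_name,
            key,
            Callback=lambda sent: progress.update(task, advance=sent),
        )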


if __name__ == "__main__":
    file_name = (
        input("Enter the file name (default: stock_data_monthly.csv): ")
        or "stock_data_monthly.csv"
    )
    if not file_name.endswith(".csv"):
        file_name += ".csv"
    num_rows = int(input("Enter the number of rows (default: 10000000): ") or 10000000)
    while True:
        start_year = int(input("Enter the start year (default: 2022): ") or 2022)
        start_month = int(input("Enter the start month (1-12, default: 1): ") or 1)
        end_year = int(input("Enter the end year (default: 2024): ") or 2024)
        end_month = int(input("Enter the end month (1-12, default: 1): ") or 1)
        start_date = datetime(start_year, start_month, 1)
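        # Subtracting one day makes end_date the last day of the month before
        # end_month (e.g. end_year=2024, end_month=1 -> 2023-12-31).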
        end_date = datetime(end_year, end_month, 1) - timedelta(days=1)
        if start_date >= end_date:
            print("End date must be later than start date. Please try again.")
        else:
            break
    start_time = time.time()
    create_dataset(file_name=file_name, num_rows=num_rows, start_date=start_date, end_date=end_date)
    end_time = time.time()
    print(f"Execution time of create_dataset: {end_time - start_time:.2f} seconds")
    start_time = time.time()
    upload_dataset_to_s3(file_name=file_name)
    end_time = time.time()
    print(
        f"Execution time of upload_dataset_to_s3: {end_time - start_time:.2f} seconds"
    )