# pickle_functions.py

###############
#   MODULES   #
###############
import pickle
import _pickle as cPickle

import boto3
import boto3.session
import gcsfs
import pandas as pd

#################
#   FUNCTIONS   #
#################
def write_pickle_to_s3bucket(filename, df, bucket_name):
    '''
    Pickles a data frame and uploads it to an AWS S3 bucket.

    Input(s):
        filename - (string) the filename for the pickle (without extension)
        df - (data frame) the frame you want to pickle
        bucket_name - (string) the name of the S3 bucket into which the
                      pickle file will be dumped
    '''
    s3_resource = boto3.resource('s3')
    key = filename + '.pkl'
    # Write the pickle to a local file first, then upload that file to S3.
    df.to_pickle(key)
    with open(key, 'rb') as data:
        s3_resource.Object(bucket_name, key).put(Body=data)
    print("Pickled and sent to bucket!")
def read_pickle(bucket_name, filename):
    '''
    Reads a pickled object from an S3 bucket into a pandas data frame.

    Input(s):
        filename - (string) the pickle object filename (without extension)
        bucket_name - (string) the S3 bucket name
    Output(s):
        df - (data frame) pandas data frame of the pickle file
    Code modified from: https://github.com/noopurrkalawatia/Dhvani/blob/master/Boto3_features.ipynb
    '''
    session = boto3.session.Session(region_name='us-east-1')
    s3client = session.client('s3')
    response = s3client.get_object(Bucket=bucket_name, Key=filename + '.pkl')
    body_bytes = response['Body'].read()
    df = cPickle.loads(body_bytes)
    return df
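
# Usage sketch for read_pickle. Reads back the object written in the sketch
# above; the bucket name and key are placeholders, and the hard-coded
# us-east-1 region must match where the bucket actually lives.
#
#   sample_df = read_pickle("example-bucket", "sample_frame")
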
def pickle_to_gcs(df, filename, bucket_name="no-hate", directory=None, project_id=None):
    """
    Pickle a dataframe to a specified file in a given Google Cloud Storage
    bucket, optionally within a specified directory.

    Arguments:
        df: a pandas dataframe
        filename: the filename to write to within the given bucket
        bucket_name: the bucket to write to
        directory: the directory (prefix) to write to, if any
        project_id: the Google Cloud project passed to gcsfs.GCSFileSystem
    """
    fs = gcsfs.GCSFileSystem(project=project_id)
    if directory:
        directory = directory + "/"
    else:
        directory = ""
    # Pickle the frame directly into the GCS object via a gcsfs file handle.
    with fs.open(f"{bucket_name}/{directory}{filename}", "wb") as handle:
        pickle.dump(df, handle)
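
# Usage sketch for pickle_to_gcs. Assumes Google Cloud credentials are
# available to gcsfs and that the bucket exists; the project id, bucket, and
# directory below are placeholders.
#
#   pickle_to_gcs(sample_df, "sample_frame.pkl", bucket_name="example-bucket",
#                 directory="frames", project_id="example-project")
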
def load_pickle_from_gcs(filename, bucket_name="no-hate", directory=None, project_id=None):
    """
    Load a pickled dataframe from a specified file in a given Google Cloud
    Storage bucket, optionally within a specified directory.

    Arguments:
        filename: the filename to read from within the given bucket
        bucket_name: the bucket to read from
        directory: the directory (prefix) to read from, if any
        project_id: the Google Cloud project passed to gcsfs.GCSFileSystem
    Returns:
        df: the unpickled pandas dataframe
    """
    fs = gcsfs.GCSFileSystem(project=project_id)
    if directory:
        directory = directory + "/"
    else:
        directory = ""
    with fs.open(f"{bucket_name}/{directory}{filename}", "rb") as handle:
        df = pickle.load(handle)
    return df
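
# Minimal round-trip sketch, assuming the placeholder bucket "example-bucket"
# and project "example-project" exist and gcsfs credentials are configured;
# nothing here runs on import.
if __name__ == "__main__":
    demo_df = pd.DataFrame({"id": [1, 2, 3], "text": ["a", "b", "c"]})

    # Write the frame to GCS, read it back, and confirm the shapes match.
    pickle_to_gcs(demo_df, "demo_frame.pkl", bucket_name="example-bucket",
                  project_id="example-project")
    restored = load_pickle_from_gcs("demo_frame.pkl", bucket_name="example-bucket",
                                    project_id="example-project")
    print(restored.shape == demo_df.shape)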