Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kaggle download dataset in temp by argparse and tempfile #3

Open
davidkorea opened this issue Dec 20, 2018 · 0 comments
Open

Kaggle download dataset in temp by argparse and tempfile #3

davidkorea opened this issue Dec 20, 2018 · 0 comments

Comments

@davidkorea
Copy link
Owner

davidkorea commented Dec 20, 2018

1. Kaggle download dataset

Reference: tensorflow_official_word2vec_skipgram.ipynb

import os
import argparse
import sys
from tempfile import gettempdir
from six.moves import urllib

# Folder that holds the launched script (symlinks resolved); it anchors the
# default log location below.
current_path = os.path.dirname(os.path.realpath(sys.argv[0]))

# Command-line interface: only the TensorBoard log directory is configurable.
# parse_known_args() collects unrecognised arguments in `unparsed` instead of
# rejecting them.
_default_log_dir = os.path.join(current_path, 'log')
parser = argparse.ArgumentParser()
parser.add_argument(
    '--log_dir',
    help='The log directory for TensorBoard summaries.',
    default=_default_log_dir,
    type=str)
FLAGS, unparsed = parser.parse_known_args()

# Make sure the log directory is present before anything writes to it.
if not os.path.exists(FLAGS.log_dir):
    os.makedirs(FLAGS.log_dir)

# Step 1: Download the data.
# Full archive location:
# http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
# Base URL only: the file name is appended verbatim when downloading, so the
# literal must not carry leading or trailing whitespace.
url = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/'

# pylint: disable=redefined-outer-name
def maybe_download(filename, expected_bytes):
    """Fetch ``filename`` into the temp directory unless it is already there.

    The file is looked up under ``gettempdir()``. When it is absent it is
    retrieved from ``url + filename`` (``url`` is the module-level base URL).
    The size of the file on disk is then compared with ``expected_bytes``.

    Args:
        filename: Name of the file, relative to both the base URL and the
            temp directory.
        expected_bytes: Size in bytes the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``; the
            actual size is printed first.
    """
    target = os.path.join(gettempdir(), filename)
    if not os.path.exists(target):
        # urlretrieve returns (local_path, headers); only the path is needed.
        target, _headers = urllib.request.urlretrieve(url + filename, target)

    actual_bytes = os.stat(target).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception('Failed to verify ' + target +
                        '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return target


# Fetch (or reuse) the archive; the second argument is the expected size in
# bytes that maybe_download() verifies.
filename = maybe_download('simple-examples.tgz', 34869662)
# Bare expression: presumably shown as notebook cell output.
filename
# '/tmp/simple-examples.tgz'
# List the same temp directory maybe_download() writes to, rather than a
# hard-coded '/tmp'.
os.listdir(gettempdir())
# ['.ipython', '.config', '.keras', 'simple-examples.tgz', '.local', '.cache']

2. Unzip the .tgz file in the temp directory on Kaggle

import tarfile

# Archive path returned by maybe_download() in step 1.
tgz_filename = filename
# The context manager closes the archive even if extraction raises.
# NOTE(review): extraction trusts the member paths stored in the archive;
# only use this on archives from a trusted source.
with tarfile.open(tgz_filename, "r:gz") as tarobj:
    tarobj.extractall(r"/tmp")

Extraction creates a new folder named 'simple-examples' in /tmp; the .txt data files are in its 'data' subfolder.

# Folder holding the extracted PTB text files.
data_path = '/tmp/simple-examples/data'

# Build the file path from data_path instead of repeating the literal.
# Newlines are replaced with the "<eos>" marker before splitting on
# whitespace, and the resulting token list is printed.
with tf.gfile.GFile(os.path.join(data_path, 'ptb.test.txt'), "r") as f:
    print(f.read().replace("\n", "<eos>").split())
        
@davidkorea davidkorea changed the title Kaggle download dataset temp Kaggle download dataset temp by argparse tempfile Dec 20, 2018
@davidkorea davidkorea changed the title Kaggle download dataset temp by argparse tempfile Kaggle download dataset in temp by argparse and tempfile Dec 20, 2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant