This is an NLP (Natural Language Processing) project that focuses on gathering data from Twitter and labeling the topics of the tweets using ChatGPT under the supervision of a human annotator.
Conda and Poetry are required to run this project.
conda create -n nlp-project python=3.11
conda activate nlp-project
# (Required for PyTorch to work)
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
poetry install
cd src
cp .env.sample .env
touch users.csv
Phase 1: Crawl -> Label -> Clean/Punc -> Break by word/sentence -> Analyze
python main.py --help
python main.py scrape-twitter
python main.py example-labeling
python main.py label-data
Data Cleaning (During cleaning, we use ChatGPT to translate tweets that have more than 40% English tokens. Other details are explained in the report.)
python main.py clean-data path-to-labeled-csv
example: python main.py clean-data ../data/raw/labeled_2023-06-02-10-27-57.csv
python main.py break-by-word path-to-cleaned-csv
example: python main.py break-by-word ../data/clean/cleaned_2023-06-02-10-27-57.csv
python main.py break-by-sentence path-to-punc-csv
example: python main.py break-by-sentence ../data/clean/punc_2023-06-02-10-27-57.csv
python main.py get-stats file-timestamp
example: python main.py get-stats 2023-06-02-10-27-57
python main.py generate-pdf-report file-timestamp
example: python main.py generate-pdf-report 2023-06-02-10-27-57
python src/main.py augment-data path-to-cleaned-csv
example: python src/main.py augment-data ../data/clean/cleaned_2023-06-02-10-27-57.csv
example: python src/main.py augment-data ../data/clean/cleaned_2023-06-02-10-27-57.csv --min-tweet-count-per-label 100
python src/main.py train-word2vec-label path-to-augmented-csv label
example: python src/main.py train-word2vec-label ../data/augment/augmented_2023-06-02-10-27-57.csv home_and_garden
python src/main.py train-word2vec-preselected path-to-augmented-csv
example: python src/main.py train-word2vec-preselected ../data/augment/augmented_2023-06-02-10-27-57.csv
python src/main.py train-word2vec-all path-to-augmented-csv
example: python src/main.py train-word2vec-all ../data/augment/augmented_2023-06-02-10-27-57.csv
python src/main.py get-most-similar-words label word --topn 10
example: python src/main.py get-most-similar-words all سیاست --topn 10
python src/main.py get-word2vec-stats
python src/main.py train-tokenizer path-to-augmented-csv
example: python src/main.py train-tokenizer ../data/augment/augmented_2023-06-02-10-27-57.csv
example: python src/main.py train-tokenizer ../data/augment/augmented_2023-06-02-10-27-57.csv --vocab-size 2000
python src/main.py fine-tune-gpt2 path-to-augmented-csv
example: python src/main.py fine-tune-gpt2 ../data/augment/augmented_2023-06-02-10-27-57.csv
example: python src/main.py fine-tune-gpt2 ../data/augment/augmented_2023-06-02-10-27-57.csv --desired-label home_and_garden
python src/main.py complete-prompt-gpt2 prompt label
example: python src/main.py complete-prompt-gpt2 'سیاستمدار همه دروغ' politics_and_current_affairs
python src/main.py fine-tune-parsbert path-to-augmented-csv
example: python src/main.py fine-tune-parsbert ../data/augment/augmented_2023-06-02-10-27-57.csv
python src/main.py test-parsbert
python src/main.py classify-tweet-parsbert tweet
example: python src/main.py classify-tweet-parsbert 'باید به گل و گیاه رسید.'
python src/main.py test-openai
python src/main.py classify-tweet-openai tweet
example: python src/main.py classify-tweet-openai 'باید به گل و گیاه رسید.'
python src/main.py generate-final-pdf-report