diff --git a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb
index 7d129ee4..a3c4f316 100644
--- a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb
+++ b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb
@@ -12,6 +12,8 @@
     "from sklearn.cluster import KMeans\n",
     "from sklearn.metrics import silhouette_score\n",
     "from nltk.sentiment import SentimentIntensityAnalyzer\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.decomposition import PCA\n",
     "\n",
     "# Load NLTK's sentiment analyzer\n",
     "sid = SentimentIntensityAnalyzer()\n",
@@ -36,11 +38,38 @@
     "tfidf_vectorizer = TfidfVectorizer(max_features=1000) # You can adjust max_features as needed\n",
     "tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])\n",
     "\n",
+    "# Optimal number of clusters\n",
+    "inertia = []\n",
+    "for k in range(2, 11):\n",
+    "    kmeans = KMeans(n_clusters=k, random_state=42)\n",
+    "    kmeans.fit(tfidf_matrix)\n",
+    "    inertia.append(kmeans.inertia_)\n",
+    "\n",
+    "plt.plot(range(2, 11), inertia, marker='o')\n",
+    "plt.xlabel('Number of Clusters')\n",
+    "plt.ylabel('Inertia')\n",
+    "plt.title('Elbow Method')\n",
+    "plt.show()\n",
+    "\n",
     "# Clustering with K-means\n",
     "k = 5 # Number of clusters (you can adjust this)\n",
     "kmeans = KMeans(n_clusters=k, random_state=42)\n",
     "kmeans.fit(tfidf_matrix)\n",
     "\n",
+    "# Dimensionality reduction for visualization\n",
+    "pca = PCA(n_components=2)\n",
+    "tfidf_matrix_2d = pca.fit_transform(tfidf_matrix.toarray())\n",
+    "\n",
+    "# Visualize clusters\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "for i in range(k):\n",
+    "    plt.scatter(tfidf_matrix_2d[kmeans.labels_ == i, 0], tfidf_matrix_2d[kmeans.labels_ == i, 1], label=f'Cluster {i+1}')\n",
+    "plt.title('Clusters Visualization')\n",
+    "plt.xlabel('PCA Component 1')\n",
+    "plt.ylabel('PCA Component 2')\n",
+    "plt.legend()\n",
+    "plt.show()\n",
+    "\n",
     "# Assign cluster labels to each review\n",
     "data['cluster_label'] = kmeans.labels_\n",
     "\n",