recodehive · sanjay-kv · May 19, 2024 · May 16, 2024
diff --git a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb
@@ -12,6 +12,8 @@
     "from sklearn.cluster import KMeans\n",
     "from sklearn.metrics import silhouette_score\n",
     "from nltk.sentiment import SentimentIntensityAnalyzer\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.decomposition import PCA\n",
     "\n",
     "# Load NLTK's sentiment analyzer\n",
     "sid = SentimentIntensityAnalyzer()\n",
@@ -36,11 +38,38 @@
     "tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed\n",
     "tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])\n",
     "\n",
+    "# Elbow method: record K-means inertia for k = 2..10 and plot it to help choose the number of clusters\n",
+    "inertia = []\n",
+    "for k in range(2, 11):\n",
+    "    kmeans = KMeans(n_clusters=k, random_state=42)\n",
+    "    kmeans.fit(tfidf_matrix)\n",
+    "    inertia.append(kmeans.inertia_)\n",
+    "\n",
+    "plt.plot(range(2, 11), inertia, marker='o')\n",
+    "plt.xlabel('Number of Clusters')\n",
+    "plt.ylabel('Inertia')\n",
+    "plt.title('Elbow Method')\n",
+    "plt.show()\n",
+    "\n",
     "# Clustering with K-means\n",
     "k = 5  # Number of clusters (you can adjust this)\n",
     "kmeans = KMeans(n_clusters=k, random_state=42)\n",
     "kmeans.fit(tfidf_matrix)\n",
     "\n",
+    "# Dimensionality reduction for visualization\n",
+    "pca = PCA(n_components=2)\n",
+    "tfidf_matrix_2d = pca.fit_transform(tfidf_matrix.toarray())\n",
+    "\n",
+    "# Visualize clusters in the 2-D PCA projection (legend numbers are i+1, i.e. offset by one from kmeans.labels_)\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "for i in range(k):\n",
+    "    plt.scatter(tfidf_matrix_2d[kmeans.labels_ == i, 0], tfidf_matrix_2d[kmeans.labels_ == i, 1], label=f'Cluster {i+1}')\n",
+    "plt.title('Clusters Visualization')\n",
+    "plt.xlabel('PCA Component 1')\n",
+    "plt.ylabel('PCA Component 2')\n",
+    "plt.legend()\n",
+    "plt.show()\n",
+    "\n",
     "# Assign cluster labels to each review\n",
     "data['cluster_label'] = kmeans.labels_\n",
     "\n",