From 5b48048b838c172afc20176056108ca8424ff3ea Mon Sep 17 00:00:00 2001
From: Cheshta17 <122281116+Cheshta17@users.noreply.github.com>
Date: Thu, 16 May 2024 22:46:55 +0530
Subject: [PATCH] Add elbow-method plot and PCA cluster visualization

---
 .../clustering movie review-checkpoint.ipynb  | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb
index 7d129ee4..a3c4f316 100644
--- a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb	
+++ b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb	
@@ -12,6 +12,8 @@
     "from sklearn.cluster import KMeans\n",
     "from sklearn.metrics import silhouette_score\n",
     "from nltk.sentiment import SentimentIntensityAnalyzer\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.decomposition import PCA\n",
     "\n",
     "# Load NLTK's sentiment analyzer\n",
     "sid = SentimentIntensityAnalyzer()\n",
@@ -36,11 +38,38 @@
     "tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed\n",
     "tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])\n",
     "\n",
+    "# Elbow method: record K-means inertia for k = 2..10 to help choose the number of clusters\n",
+    "inertia = []\n",
+    "for k in range(2, 11):\n",
+    "    kmeans = KMeans(n_clusters=k, random_state=42)\n",
+    "    kmeans.fit(tfidf_matrix)\n",
+    "    inertia.append(kmeans.inertia_)\n",
+    "\n",
+    "plt.plot(range(2, 11), inertia, marker='o')\n",
+    "plt.xlabel('Number of Clusters')\n",
+    "plt.ylabel('Inertia')\n",
+    "plt.title('Elbow Method')\n",
+    "plt.show()\n",
+    "\n",
     "# Clustering with K-means\n",
     "k = 5  # Number of clusters (you can adjust this)\n",
     "kmeans = KMeans(n_clusters=k, random_state=42)\n",
     "kmeans.fit(tfidf_matrix)\n",
     "\n",
+    "# Project the (densified) TF-IDF matrix onto 2 PCA components for plotting\n",
+    "pca = PCA(n_components=2)\n",
+    "tfidf_matrix_2d = pca.fit_transform(tfidf_matrix.toarray())\n",
+    "\n",
+    "# Scatter-plot the 2-D projection, one series per K-means cluster label\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "for i in range(k):\n",
+    "    plt.scatter(tfidf_matrix_2d[kmeans.labels_ == i, 0], tfidf_matrix_2d[kmeans.labels_ == i, 1], label=f'Cluster {i+1}')\n",
+    "plt.title('Clusters Visualization')\n",
+    "plt.xlabel('PCA Component 1')\n",
+    "plt.ylabel('PCA Component 2')\n",
+    "plt.legend()\n",
+    "plt.show()\n",
+    "\n",
     "# Assign cluster labels to each review\n",
     "data['cluster_label'] = kmeans.labels_\n",
     "\n",