From 5b48048b838c172afc20176056108ca8424ff3ea Mon Sep 17 00:00:00 2001 From: Cheshta17 <122281116+Cheshta17@users.noreply.github.com> Date: Thu, 16 May 2024 22:46:55 +0530 Subject: [PATCH] Add error handling, Change in clusters --- .../clustering movie review-checkpoint.ipynb | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb index 7d129ee4..a3c4f316 100644 --- a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb +++ b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb @@ -12,6 +12,8 @@ "from sklearn.cluster import KMeans\n", "from sklearn.metrics import silhouette_score\n", "from nltk.sentiment import SentimentIntensityAnalyzer\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.decomposition import PCA\n", "\n", "# Load NLTK's sentiment analyzer\n", "sid = SentimentIntensityAnalyzer()\n", @@ -36,11 +38,38 @@ "tfidf_vectorizer = TfidfVectorizer(max_features=1000) # You can adjust max_features as needed\n", "tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])\n", "\n", + "# Optimal number of clusters\n", + "inertia = []\n", + "for k in range(2, 11):\n", + " kmeans = KMeans(n_clusters=k, random_state=42)\n", + " kmeans.fit(tfidf_matrix)\n", + " inertia.append(kmeans.inertia_)\n", + "\n", + "plt.plot(range(2, 11), inertia, marker='o')\n", + "plt.xlabel('Number of Clusters')\n", + "plt.ylabel('Inertia')\n", + "plt.title('Elbow Method')\n", + "plt.show()\n", + "\n", "# Clustering with K-means\n", "k = 5 # Number of clusters (you can adjust this)\n", "kmeans = KMeans(n_clusters=k, random_state=42)\n", "kmeans.fit(tfidf_matrix)\n", "\n", + "# Dimensionality reduction for visualization\n", + "pca = PCA(n_components=2)\n", + "tfidf_matrix_2d = pca.fit_transform(tfidf_matrix.toarray())\n", + "\n", + "# Visualize clusters\n", + "plt.figure(figsize=(10, 6))\n", + "for i in range(k):\n", + " plt.scatter(tfidf_matrix_2d[kmeans.labels_ == i, 0], tfidf_matrix_2d[kmeans.labels_ == i, 1], label=f'Cluster {i+1}')\n", + "plt.title('Clusters Visualization')\n", + "plt.xlabel('PCA Component 1')\n", + "plt.ylabel('PCA Component 2')\n", + "plt.legend()\n", + "plt.show()\n", + "\n", "# Assign cluster labels to each review\n", "data['cluster_label'] = kmeans.labels_\n", "\n",