From 893f136176890224acd553648e689f0d67bfaa59 Mon Sep 17 00:00:00 2001
From: Felipe <60966475+fevieira27@users.noreply.github.com>
Date: Fri, 19 Jan 2024 22:20:37 +0000
Subject: [PATCH] Update DeezerAnalysisAI.R

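Fixed issue with ID column: it is now created just before the final merges
instead of before the similarity sections. Duration is also no longer passed
through tolower().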
---
 DeezerAnalysisAI.R | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/DeezerAnalysisAI.R b/DeezerAnalysisAI.R
index 5c5fac4..53fb624 100644
--- a/DeezerAnalysisAI.R
+++ b/DeezerAnalysisAI.R
@@ -67,22 +67,24 @@ songs <- get_songs(url)
 # Convert to dataframe
 df_songs <- bind_rows(lapply(songs, as.data.frame.list))
 
-# str(df_songs)
-
+# Keep only the track, artist and album columns used further down
 df_songs_filtered <- df_songs %>% select(id, title, title_short, isrc, duration, rank, type, artist.id, artist.name, artist.type, album.id, album.title, album.type)
 
-print(df_songs_filtered)
+# Review the results
+# print(df_songs_filtered)
 
+# Lowercase the artist name and name the resulting column "artist"
 dupSongs_df <- data.frame(tolower(df_songs_filtered$artist.name), stringsAsFactors = FALSE)
 colnames(dupSongs_df) <- "artist"
 
+# Cleaning song title to lowercase
 dupSongs_df$title <- tolower(df_songs_filtered$title)
 
-dupSongs_df$duration <- tolower(df_songs_filtered$duration)
+# Add duration as-is (tolower() would coerce it to character)
+dupSongs_df$duration <- df_songs_filtered$duration
 
-dupSongs_df <- rownames_to_column(dupSongs_df, var = "ID")
-
-print(dupSongs_df)
+# Review the results
+# print(dupSongs_df)
 
 ############## Using Levenshtein Similarity
 
@@ -95,8 +97,8 @@ rpairsLeven <- epiWeights(rpairsLeven)
 # Get pairs with a high probability of being duplicates
 duplicatesLeven <- getPairs(rpairsLeven, min.weight=0.79, max.weight=0.99)
 
-# summary(epiClassify(rpairsLeven,0.6))
 # Review the results
+# summary(epiClassify(rpairsLeven,0.6))
 # print(duplicatesLeven)
 
 
@@ -128,7 +130,10 @@ subLV$source <- "Levenshtein"
 # Appending all those into a dataframe, filtering by similarity of artist, title and durantion
 duplicatesTotal <- rbind(subJW, subLV)
 
-# Join back to the original dataset, to get track info and allow validation of duplicates
+# Create the ID column from the row names of dupSongs_df, used as the join key below
+dupSongs_df <- rownames_to_column(dupSongs_df, var = "ID")
+
+# Join total duplicates found with the original dataset, to get track info and allow validation of duplicates
 finalDupSongs <- merge(duplicatesTotal, dupSongs_df, by.x = "id1", by.y = "ID")
 # colnames(finalDupSongs) <- c("id1","id2","artist_sim","title_sim","duration_sim","is_match","AvgWeight","source","artist_1","title_1","duration_1")
 finalDupSongs <- merge(finalDupSongs, dupSongs_df, by.x = "id2", by.y = "ID")