From c7153a4a9c18fb6707571647b3cf04dfac3ddc0f Mon Sep 17 00:00:00 2001 From: epe12345 <132982056+epe12345@users.noreply.github.com> Date: Sun, 26 May 2024 00:21:11 +0900 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=EC=9C=84=EC=B9=98=EC=98=88?= =?UTF-8?q?=EC=B8=A1,=20=EC=9D=98=EB=AF=B8=EC=9E=A5=EC=86=8C=20=EA=B8=B0?= =?UTF-8?q?=EB=8A=A5=20=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ai/LocationAnalyzer.py | 237 ++++++++++++++++++++++++++--------------- ai/LocationPredict.py | 182 ++----------------------------- 2 files changed, 158 insertions(+), 261 deletions(-) diff --git a/ai/LocationAnalyzer.py b/ai/LocationAnalyzer.py index ed841d80..162b25ed 100644 --- a/ai/LocationAnalyzer.py +++ b/ai/LocationAnalyzer.py @@ -1,127 +1,190 @@ -from pyclustering.cluster.gmeans import gmeans -from collections import Counter import numpy as np import pandas as pd +import json +import math +from scipy.spatial import distance +from sklearn.metrics.pairwise import haversine_distances +from pyclustering.cluster.gmeans import gmeans +from collections import Counter import warnings warnings.simplefilter(action='ignore', category=FutureWarning) # FutureWarning 제거 pd.set_option('mode.chained_assignment', None) class LocationAnalyzer: - def __init__(self, filename): - self.df = self.fileReader(filename) - - # 파일 읽기 - # 데이터 예시 (39.984702,116.318417,0,492,39744.1201851852,2008-10-23,02:53:04) - # (위도, 경도, 0, 고도, 1899년 이후 경과한 시간, 날짜, 시간) - def fileReader(self, filename): - - latitude = [] # 위도 - longitude = [] # 경도 - date = [] # 날짜 - time = [] # 시간 - - with open(filename, 'r') as file: - data = file.read() - - # 데이터에 불필요한 부분 제거 - # 추후 데이터 형식에 따라 수정 필요 * - # data = data.split('\n')[:-1] - data = data.split('\n')[6:-1] - for i in range(len(data)): - line = data[i].split(',') - latitude.append(line[0]) # 위도 - longitude.append(line[1]) # 경도 - #date.append(line[2]) # 날짜 - #time.append(line[3]) # 시간 - date.append(line[5]) - time.append(line[6]) - df = pd.DataFrame({"latitude":latitude, "longitude":longitude, "date":date, "time":time}) + def __init__(self, csv_path) -> None: + self.df = pd.DataFrame() + self.fileReader(csv_path) + def convert_day_to_number(self, day): + weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] + return weekdays.index(day) - df['latitude'] = df['latitude'].astype(float) - df['longitude'] = df['longitude'].astype(float) - df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S') - df['datetime'] = df['datetime'].dt.floor('T') - # 시간대와 요일 추가 - # 시간대 형식 : f00t04 f20t24 - # 4시간 단위로 분리 - df['hour_block'] = 'f' + ((df['datetime'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((df['datetime'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2) - df['day_of_week'] = df['datetime'].dt.day_name() - df = df.drop(['date', 'time'], axis=1) - df = df.drop_duplicates(['datetime'], ignore_index=True) - - return df - - # 의미장소 추출 - def gmeansFit(self, df): - # 두 열을 선택하고 넘파이 배열로 변환 - selectedColumns = ['latitude', 'longitude'] - resultList = df[selectedColumns].values.tolist() # 리스트로 변환 + def custom_parser(self, date_string): + return pd.to_datetime(date_string, format='%Y-%m-%d') # 날짜 형식에 맞게 지정 - gmeansInstance = gmeans(resultList).process() # 클러스터링 + def fileReader(self, csv_path): + data = pd.read_csv(csv_path, parse_dates=['date'], date_parser=self.custom_parser) + index = list(range(len(data))) + data.index = index + self.df = data[['date', 'time', 'latitude', 'longitude', 
'user_status']] - centers = gmeansInstance.get_centers() # 클러스터의 중심 (의미장소) - clusters = gmeansInstance.get_clusters() # 분류된 클러스터 + self.df['date'] = pd.to_datetime(self.df['date'], format='%Y-%m-%d') + self.df['time'] = pd.to_datetime(self.df['time'], format='%H:%M:%S') + self.df['hour_block'] = 'f' + ((self.df['time'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((self.df['time'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2) + self.df['day_of_week'] = self.df['date'].dt.day_name() - return clusters, centers - - # 호출 함수 - def gmeansFunc(self): + new_data = [] + for item in self.df['hour_block']: + num = int(item[1:-3]) + new_data.append(num) + self.df['hour_block'] = new_data + self.df['day_of_week'] = self.df['day_of_week'].apply(self.convert_day_to_number) + self.df = self.df.drop(['date', 'time'], axis=1) - clusters, centers = self.gmeansFit(self.df) + def gmeans_fit(self): + # 두 열을 선택하고 넘파이 배열로 변환 + selected_columns = ['latitude', 'longitude'] + result_list = self.df[selected_columns].values.tolist() - data_df = pd.DataFrame({"clusters":clusters, "centers":centers}) + gmeans_instance = gmeans(result_list).process() + + centers = gmeans_instance.get_centers() + clusters = gmeans_instance.get_clusters() + return clusters, centers + + def gmeans_func(self): + clusters, centers = self.gmeans_fit() + + data_df = pd.DataFrame({"clusters": clusters, "centers": centers}) + for k in range(len(data_df.clusters)): - if (len(data_df.clusters[k]) < 10): + if len(data_df.clusters[k]) < 10: data_df.drop(index=k, inplace=True) data_df = data_df.sort_index(axis=1) data_df = data_df.reset_index(drop=True) - - self.df['clusterNo'] = -1 + + self.df['cluster_no'] = -1 for i in range(len(data_df)): for j in range(len(data_df['clusters'].iloc[i])): k = data_df['clusters'].iloc[i][j] - self.df['clusterNo'].iloc[k] = i - - self.df = self.df[self.df['clusterNo'] != -1] - - + self.df['cluster_no'].iloc[k] = i + data_df['hour_block'] = 0 data_df['day_of_week'] = 0 - for i in range(max(self.df['clusterNo'])+1): - - counter = Counter(self.df[self.df['clusterNo'] == i]['hour_block']) + for i in range(max(self.df['cluster_no']) + 1): + counter = Counter(self.df[self.df['cluster_no'] == i]['hour_block']) most_hour_value = counter.most_common(1)[0][0] - counter = Counter(self.df[self.df['clusterNo'] == i]['day_of_week']) + counter = Counter(self.df[self.df['cluster_no'] == i]['day_of_week']) most_day_value = counter.most_common(1)[0][0] data_df['hour_block'].iloc[i] = most_hour_value data_df['day_of_week'].iloc[i] = most_day_value - data_list = data_df.values.tolist() - return data_list + data_df[['latitude', 'longitude']] = data_df['centers'].apply(lambda x: pd.Series(x)) + data_df.drop('centers', axis=1, inplace=True) + data_df = data_df[['latitude', 'longitude', 'clusters', 'hour_block', 'day_of_week']] + + meaningful_df = data_df[['latitude', 'longitude', 'hour_block', 'day_of_week']] + meaningful_df[['latitude', 'longitude']] = meaningful_df[['latitude', 'longitude']].round(4) + meaningful_df = meaningful_df.drop_duplicates(['latitude', 'longitude'], keep='first', ignore_index=True) + + return meaningful_df + + def haversine(self, lat1, lon1, lat2, lon2): + R = 6371000.0 # 지구 반지름 (미터) + lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2]) + dlat = lat2 - lat1 + dlon = lon2 - lon1 + + a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2 + c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) + distance = R * c + return distance + + def calculate_bearing(self, lat1, 
lon1, lat2, lon2): + lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2]) + dlon = lon2 - lon1 + x = math.sin(dlon) * math.cos(lat2) + y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(dlon) + initial_bearing = math.atan2(x, y) + initial_bearing = math.degrees(initial_bearing) + compass_bearing = (initial_bearing + 360) % 360 + return compass_bearing + def add_movement_features(self): + self.df['next_lat'] = self.df['latitude'].shift(1) + self.df['next_lon'] = self.df['longitude'].shift(1) + self.df['distance'] = self.df.apply(lambda row: self.haversine(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1) + self.df['bearing'] = self.df.apply(lambda row: self.calculate_bearing(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1) -if __name__ == '__main__': - # 파일 경로 가져오기 - filePath = r"C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20081202160051.txt" - la = LocationAnalyzer(filePath) + self.df = self.df.drop(['next_lat', 'next_lon'], axis=1) + self.df = self.df.fillna(0) + + def map_to_meaningful_places(self, meaningful_df): + y, m_hour_block, m_day_of_week = [], [], [] + for i in range(len(self.df)): + current_location = (self.df['latitude'].iloc[i], self.df['longitude'].iloc[i]) + min_distance = float('inf') + + for j in range(len(meaningful_df)): + place_location = (meaningful_df['latitude'].iloc[j], meaningful_df['longitude'].iloc[j]) + dist = distance.euclidean(current_location, place_location) + if dist < min_distance: + min_distance = dist + min_distance_index = j + + y.append(min_distance_index) + m_hour_block.append(meaningful_df['hour_block'].iloc[min_distance_index]) + m_day_of_week.append(meaningful_df['day_of_week'].iloc[min_distance_index]) + + self.df['m_hour_block'] = m_hour_block + self.df['m_day_of_week'] = m_day_of_week + self.df['y'] = y + + def calculate_additional_features(self): + self.df['speed'] = self.df['distance'] / (self.df['hour_block'] / 4 + 1) + self.df['lat_rate_change'] = self.df['latitude'].diff() / self.df['hour_block'].diff().replace(0, 1) + self.df['lon_rate_change'] = self.df['longitude'].diff() / self.df['hour_block'].diff().replace(0, 1) + + daily_variability = self.df.groupby('day_of_week')[['latitude', 'longitude']].std().add_suffix('_daily_var') + hourly_variability = self.df.groupby('hour_block')[['latitude', 'longitude']].std().add_suffix('_hourly_var') + self.df = self.df.join(daily_variability, on='day_of_week') + self.df = self.df.join(hourly_variability, on='hour_block') - data = la.gmeansFunc() + self.df['max_travel_range'] = self.df.groupby('hour_block')['distance'].transform('max') + + self.df['movement_direction'] = self.df['bearing'].apply(lambda x: 0 if x < 180 else 1) + self.df = self.df.fillna(0) + + + def run_analysis(self): + meaningful_df = self.gmeans_func() + self.add_movement_features() + self.map_to_meaningful_places(meaningful_df) + self.calculate_additional_features() + + y = self.df['y'] + self.df = self.df.drop(['y'], axis=1) + self.df['y'] = y + + return self.df, meaningful_df + + +if __name__ == '__main__': + + # csv 파일 가져오기 + # 필요한 데이터 'date', 'time', 'latitude', 'longitude', 'user_status' : 날짜 시간 위도 경도 이동상태 + csv_path = r"C:\Users\sk002\Downloads\138362.csv" + la = LocationAnalyzer(csv_path) + + df, meaningful_df = la.run_analysis() - print(data[1]) - print(data[1][0]) - print(data[1][0][0]) # latitude - 
print(data[1][0][1]) # longitude - print(data[1][2]) # time - print(data[1][3]) # w - - print(type(data[1][2])) - print(type(data[1][3])) + print(df) + print(meaningful_df) \ No newline at end of file diff --git a/ai/LocationPredict.py b/ai/LocationPredict.py index 808c82af..414b6dc4 100644 --- a/ai/LocationPredict.py +++ b/ai/LocationPredict.py @@ -4,7 +4,6 @@ import tensorflow as tf import json import math -import folium import tensorflow.keras.backend as K from tensorflow import keras from scipy.spatial import distance @@ -22,6 +21,7 @@ from tensorflow.keras.utils import to_categorical from pyclustering.cluster.gmeans import gmeans from collections import Counter +from LocationAnalyzer import LocationAnalyzer class ForecastLSTMClassification: @@ -312,190 +312,24 @@ def pred(self, return y_pred -# 전처리 -class Preprocessing: - def __init__(self, csv_path) -> None: - self.df = pd.DataFrame() - self.fileReader(csv_path) - - def convert_day_to_number(self, day): - weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] - return weekdays.index(day) - - def custom_parser(self, date_string): - return pd.to_datetime(date_string, format='%Y-%m-%d') # 날짜 형식에 맞게 지정 - - def fileReader(self, csv_path): - data = pd.read_csv(csv_path, parse_dates=['date'], date_parser=self.custom_parser) - index = list(range(len(data))) - data.index = index - self.df = data[['date', 'time', 'latitude', 'longitude', 'user_status']] - - self.df['date'] = pd.to_datetime(self.df['date'], format='%Y-%m-%d') - self.df['time'] = pd.to_datetime(self.df['time'], format='%H:%M:%S') - - self.df['hour_block'] = 'f' + ((self.df['time'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((self.df['time'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2) - self.df['day_of_week'] = self.df['date'].dt.day_name() - - new_data = [] - for item in self.df['hour_block']: - num = int(item[1:-3]) - new_data.append(num) - - self.df['hour_block'] = new_data - self.df['day_of_week'] = self.df['day_of_week'].apply(self.convert_day_to_number) - self.df = self.df.drop(['date', 'time'], axis=1) - - def gmeans_fit(self): - # 두 열을 선택하고 넘파이 배열로 변환 - selected_columns = ['latitude', 'longitude'] - result_list = self.df[selected_columns].values.tolist() - - gmeans_instance = gmeans(result_list).process() - - centers = gmeans_instance.get_centers() - clusters = gmeans_instance.get_clusters() - - return clusters, centers - - def gmeans_func(self): - clusters, centers = self.gmeans_fit() - - data_df = pd.DataFrame({"clusters": clusters, "centers": centers}) - - for k in range(len(data_df.clusters)): - if len(data_df.clusters[k]) < 10: - data_df.drop(index=k, inplace=True) - data_df = data_df.sort_index(axis=1) - data_df = data_df.reset_index(drop=True) - - self.df['cluster_no'] = -1 - for i in range(len(data_df)): - for j in range(len(data_df['clusters'].iloc[i])): - k = data_df['clusters'].iloc[i][j] - self.df['cluster_no'].iloc[k] = i - - data_df['hour_block'] = 0 - data_df['day_of_week'] = 0 - for i in range(max(self.df['cluster_no']) + 1): - counter = Counter(self.df[self.df['cluster_no'] == i]['hour_block']) - most_hour_value = counter.most_common(1)[0][0] - - counter = Counter(self.df[self.df['cluster_no'] == i]['day_of_week']) - most_day_value = counter.most_common(1)[0][0] - - data_df['hour_block'].iloc[i] = most_hour_value - data_df['day_of_week'].iloc[i] = most_day_value - - data_df[['latitude', 'longitude']] = data_df['centers'].apply(lambda x: pd.Series(x)) - data_df.drop('centers', axis=1, inplace=True) - data_df 
= data_df[['latitude', 'longitude', 'clusters', 'hour_block', 'day_of_week']] - - meaningful_df = data_df[['latitude', 'longitude', 'hour_block', 'day_of_week']] - meaningful_df[['latitude', 'longitude']] = meaningful_df[['latitude', 'longitude']].round(4) - meaningful_df = meaningful_df.drop_duplicates(['latitude', 'longitude'], keep='first', ignore_index=True) - - return meaningful_df - - def haversine(self, lat1, lon1, lat2, lon2): - R = 6371000.0 # 지구 반지름 (미터) - lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2]) - dlat = lat2 - lat1 - dlon = lon2 - lon1 - - a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2 - c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) - distance = R * c - return distance - - def calculate_bearing(self, lat1, lon1, lat2, lon2): - lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2]) - dlon = lon2 - lon1 - x = math.sin(dlon) * math.cos(lat2) - y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(dlon) - initial_bearing = math.atan2(x, y) - initial_bearing = math.degrees(initial_bearing) - compass_bearing = (initial_bearing + 360) % 360 - return compass_bearing - - def add_movement_features(self): - self.df['next_lat'] = self.df['latitude'].shift(1) - self.df['next_lon'] = self.df['longitude'].shift(1) - - self.df['distance'] = self.df.apply(lambda row: self.haversine(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1) - self.df['bearing'] = self.df.apply(lambda row: self.calculate_bearing(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1) - - self.df = self.df.drop(['next_lat', 'next_lon'], axis=1) - self.df = self.df.fillna(0) - - def map_to_meaningful_places(self, meaningful_df): - y, m_hour_block, m_day_of_week = [], [], [] - for i in range(len(self.df)): - current_location = (self.df['latitude'].iloc[i], self.df['longitude'].iloc[i]) - min_distance = float('inf') - - for j in range(len(meaningful_df)): - place_location = (meaningful_df['latitude'].iloc[j], meaningful_df['longitude'].iloc[j]) - dist = distance.euclidean(current_location, place_location) - if dist < min_distance: - min_distance = dist - min_distance_index = j - - y.append(min_distance_index) - m_hour_block.append(meaningful_df['hour_block'].iloc[min_distance_index]) - m_day_of_week.append(meaningful_df['day_of_week'].iloc[min_distance_index]) - - self.df['m_hour_block'] = m_hour_block - self.df['m_day_of_week'] = m_day_of_week - self.df['y'] = y - - def calculate_additional_features(self): - self.df['speed'] = self.df['distance'] / (self.df['hour_block'] / 4 + 1) - self.df['lat_rate_change'] = self.df['latitude'].diff() / self.df['hour_block'].diff().replace(0, 1) - self.df['lon_rate_change'] = self.df['longitude'].diff() / self.df['hour_block'].diff().replace(0, 1) - - daily_variability = self.df.groupby('day_of_week')[['latitude', 'longitude']].std().add_suffix('_daily_var') - hourly_variability = self.df.groupby('hour_block')[['latitude', 'longitude']].std().add_suffix('_hourly_var') - self.df = self.df.join(daily_variability, on='day_of_week') - self.df = self.df.join(hourly_variability, on='hour_block') - - self.df['max_travel_range'] = self.df.groupby('hour_block')['distance'].transform('max') - - self.df['movement_direction'] = self.df['bearing'].apply(lambda x: 0 if x < 180 else 1) - self.df = self.df.fillna(0) - - - def run_analysis(self): - meaningful_df = 
self.gmeans_func()
-        self.add_movement_features()
-        self.map_to_meaningful_places(meaningful_df)
-        self.calculate_additional_features()
-
-        y = self.df['y']
-        self.df = self.df.drop(['y'], axis=1)
-        self.df['y'] = y
-
-        return self.df


 if __name__ =='__main__':
-    # 전처리
-    csv_path = r"C:\Users\sk002\Downloads\138362.csv"
-    pr = Preprocessing(csv_path)
-    df = pr.run_analysis()
+    la = LocationAnalyzer(r"C:\Users\sk002\Downloads\138362.csv")
+    df, meaningful_df = la.run_analysis()

     test_idx = int(len(df) * 0.8)
     df_train = df.iloc[:test_idx]
     df_test = df.iloc[test_idx:]

     # 파라미터 설정
-    seq_len = 150 # 150개의 데이터를 feature로 사용
-    steps = 150 # 향후 150개 뒤의 y를 예측
-    single_output = False
+    seq_len = 15 # 15개의 데이터를 feature로 사용
+    steps = 15 # 향후 15개 뒤의 y를 예측
+    single_output = True
     metrics = ["accuracy"] # 모델 성능 지표

     lstm_params = {
         "seq_len": seq_len,
-        "epochs": 100, # epochs 반복 횟수
+        "epochs": 10, # epochs 반복 횟수
         "patience": 30, # early stopping 조건
         "steps_per_epoch": 5, # 1 epochs 시 dataset을 5개로 분할하여 학습
         "learning_rate": 0.03,
@@ -505,7 +339,7 @@ def run_analysis(self):
         "validation_split": 0.3, # 검증 데이터셋 30%
     }
     fl = ForecastLSTMClassification(class_num=len(df['y'].unique()))
-    model = fl.fit_lstm(
+    fl.fit_lstm(
         df=df_train,
         steps=steps,
         single_output=single_output,

From 0fb0dfd8b624bb5dc44be9d043f876440c412c66 Mon Sep 17 00:00:00 2001
From: epe12345 <132982056+epe12345@users.noreply.github.com>
Date: Fri, 7 Jun 2024 02:32:53 +0900
Subject: [PATCH 2/2] =?UTF-8?q?feat:=20=EC=9C=84=EC=B9=98=20=EC=98=88?=
 =?UTF-8?q?=EC=B8=A1=20=EB=AA=A8=EB=8D=B8=20=EB=B3=80=EA=B2=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Changed from tensorflow to pytorch in order to apply huggingface peft
---
 ai/LocationPredictUsingPEFT.py | 180 +++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 ai/LocationPredictUsingPEFT.py

diff --git a/ai/LocationPredictUsingPEFT.py b/ai/LocationPredictUsingPEFT.py
new file mode 100644
index 00000000..e84b15c6
--- /dev/null
+++ b/ai/LocationPredictUsingPEFT.py
@@ -0,0 +1,180 @@
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset, random_split
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import accuracy_score
+from LocationAnalyzer import LocationAnalyzer
+
+class ForecastLSTMClassification(nn.Module):
+    def __init__(self, class_num: int, input_dim: int, hidden_dim: int, layer_dim: int, output_dim: int, dropout_prob: float = 0.2):
+        super(ForecastLSTMClassification, self).__init__()
+        self.hidden_dim = hidden_dim
+        self.layer_dim = layer_dim
+
+        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob)
+        self.fc = nn.Linear(hidden_dim, output_dim)
+        self.softmax = nn.Softmax(dim=1)  # only needed if class probabilities are wanted at inference time
+
+    def forward(self, x):
+        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
+        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
+
+        out, _ = self.lstm(x, (h0, c0))
+        out = self.fc(out[:, -1, :])
+        # return raw logits: nn.CrossEntropyLoss in fit_lstm applies softmax internally
+        return out
+
+class LSTMModel:
+    def __init__(self, class_num: int, random_seed: int = 1234):
+        self.random_seed = random_seed
+        self.class_num = class_num
+        torch.manual_seed(random_seed)
+        np.random.seed(random_seed)
+
+    def reshape_dataset(self, df: pd.DataFrame) -> np.array:
+        dataset = df.values.reshape(df.shape)
+        return dataset
+
+    def split_sequences(self, dataset: np.array, seq_len: int, steps: int, single_output: bool) 
-> tuple: + X, y = [], [] + for i in range(len(dataset) - seq_len - steps + 1): + idx_in = i + seq_len + idx_out = idx_in + steps + + if idx_out > len(dataset): + break + + seq_x = dataset[i:idx_in, :-1] + seq_y = dataset[idx_in:idx_out, -1] + + X.append(seq_x) + y.append(seq_y[0] if single_output else seq_y) + + X = np.array(X) + y = np.array(y) + return X, y + + def split_train_valid_dataset(self, df: pd.DataFrame, seq_len: int, steps: int, single_output: bool, validation_split: float = 0.2) -> tuple: + dataset = self.reshape_dataset(df=df) + X, y = self.split_sequences(dataset=dataset, seq_len=seq_len, steps=steps, single_output=single_output) + + dataset_size = len(X) + train_size = int(dataset_size * (1-validation_split)) + valid_size = dataset_size - train_size + + X_train, y_train = torch.tensor(X[:train_size, :], dtype=torch.float32), torch.tensor(y[:train_size], dtype=torch.long) + X_val, y_val = torch.tensor(X[train_size:, :], dtype=torch.float32), torch.tensor(y[train_size:], dtype=torch.long) + + train_dataset = TensorDataset(X_train, y_train) + val_dataset = TensorDataset(X_val, y_val) + + return train_dataset, val_dataset + + def build_and_compile_lstm_model(self, seq_len: int, n_features: int, hidden_dim: int, layer_dim: int, dropout_prob: float = 0.2, learning_rate: float = 0.001): + model = ForecastLSTMClassification(self.class_num, n_features, hidden_dim, layer_dim, self.class_num, dropout_prob) + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters(), lr=learning_rate) + return model, criterion, optimizer + + def fit_lstm(self, df: pd.DataFrame, steps: int, hidden_dim: int, layer_dim: int, dropout_prob: float, seq_len: int, single_output: bool, epochs: int, batch_size: int, validation_split: float, learning_rate: float): + train_dataset, val_dataset = self.split_train_valid_dataset(df=df, seq_len=seq_len, steps=steps, single_output=single_output, validation_split=validation_split) + train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True) + val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False) + + model, criterion, optimizer = self.build_and_compile_lstm_model(seq_len=seq_len, n_features=train_dataset[0][0].shape[1], hidden_dim=hidden_dim, layer_dim=layer_dim, dropout_prob=dropout_prob, learning_rate=learning_rate) + + model.train() + for epoch in range(epochs): + for X_batch, y_batch in train_loader: + optimizer.zero_grad() + outputs = model(X_batch) + loss = criterion(outputs, y_batch.view(-1)) + loss.backward() + optimizer.step() + print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}') + + return model + + def forecast_validation_dataset(self, model, val_loader): + model.eval() + y_pred_list, y_val_list = [], [] + + with torch.no_grad(): + for X_batch, y_batch in val_loader: + outputs = model(X_batch) + _, predicted = torch.max(outputs.data, 1) + y_pred_list.extend(predicted.tolist()) + y_val_list.extend(y_batch.tolist()) + return pd.DataFrame({"y": y_val_list, "yhat": y_pred_list}) + + def pred(self, df: pd.DataFrame, model, steps: int, seq_len: int, single_output: bool, batch_size: int): + dataset = self.reshape_dataset(df=df) + X_test, y_test = self.split_sequences(dataset=dataset, seq_len=seq_len, steps=steps, single_output=single_output) + + X_test_tensor = torch.tensor(X_test, dtype=torch.float32) + y_test_tensor = torch.tensor(y_test, dtype=torch.long) + + test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=batch_size, shuffle=False) + + model.eval() + 
y_pred_list = [] + y_test_list = [] + + with torch.no_grad(): + for X_batch, y_batch in test_loader: + outputs = model(X_batch) + _, predicted = torch.max(outputs.data, 1) + y_pred_list.extend(predicted.tolist()) + y_test_list.extend(y_batch.tolist()) + + y_pred = np.array(y_pred_list) + y_test = np.array(y_test_list) + accuracy = accuracy_score(y_test, y_pred) + return y_pred, accuracy + +if __name__ == '__main__': + la = LocationAnalyzer(r"C:\Users\sk002\Downloads\138362.csv") + df, meaningful_df = la.run_analysis() + + test_idx = int(len(df) * 0.8) + df_train = df.iloc[:test_idx] + df_test = df.iloc[test_idx:] + + # 파라미터 설정 + seq_len = 30 + steps = 30 + single_output = True + lstm_params = { + "seq_len": seq_len, + "epochs": 30, + "patience": 30, + "learning_rate": 0.03, + "hidden_dim": 64, + "layer_dim": 2, + "dropout_prob": 0, + "batch_size": 32, + "validation_split": 0.3, + } + + lstm_model = LSTMModel(class_num=len(df['y'].unique())) + trained_model = lstm_model.fit_lstm( + df=df_train, + steps=steps, + hidden_dim=lstm_params["hidden_dim"], + layer_dim=lstm_params["layer_dim"], + dropout_prob=lstm_params["dropout_prob"], + seq_len=seq_len, + single_output=single_output, + epochs=lstm_params["epochs"], + batch_size=lstm_params["batch_size"], + validation_split=lstm_params["validation_split"], + learning_rate=lstm_params["learning_rate"] + ) + + y_pred, acc = lstm_model.pred(df=df_test, model=trained_model, steps=steps, seq_len=seq_len, single_output=single_output, batch_size=lstm_params["batch_size"]) + + print(y_pred) + print(f"acc : {acc}") \ No newline at end of file
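
Note (not part of the patch): the PATCH 2/2 message says the move from TensorFlow to PyTorch was made so that Hugging Face peft could be applied, but ai/LocationPredictUsingPEFT.py does not import or use peft anywhere yet. A minimal sketch of how LoRA adapters from the peft package might be attached to this model is shown below; the target module name "fc" matches the classifier head defined in ForecastLSTMClassification, while the hyperparameter values and feature/class counts are illustrative placeholders only, not taken from the patch.

    # Hypothetical sketch, assuming the Hugging Face `peft` package is installed.
    from peft import LoraConfig, get_peft_model
    from LocationPredictUsingPEFT import ForecastLSTMClassification

    n_features = 18   # placeholder: number of feature columns produced by LocationAnalyzer
    num_classes = 5   # placeholder: len(df['y'].unique())

    # Base LSTM classifier as defined in the patch.
    base_model = ForecastLSTMClassification(class_num=num_classes, input_dim=n_features,
                                            hidden_dim=64, layer_dim=2, output_dim=num_classes)

    # LoRA adapter on the final Linear layer ("fc"); r/alpha/dropout values are illustrative.
    lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05, target_modules=["fc"])
    peft_model = get_peft_model(base_model, lora_config)
    peft_model.print_trainable_parameters()  # only the LoRA adapter weights remain trainable

With such a wrapper, a training loop like the one in fit_lstm would update only the adapter weights (plus anything listed in modules_to_save), which is the usual reason for moving to a PyTorch model that peft can wrap.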