From c7153a4a9c18fb6707571647b3cf04dfac3ddc0f Mon Sep 17 00:00:00 2001 From: epe12345 <132982056+epe12345@users.noreply.github.com> Date: Sun, 26 May 2024 00:21:11 +0900 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=EC=9C=84=EC=B9=98=EC=98=88?= =?UTF-8?q?=EC=B8=A1,=20=EC=9D=98=EB=AF=B8=EC=9E=A5=EC=86=8C=20=EA=B8=B0?= =?UTF-8?q?=EB=8A=A5=20=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ai/LocationAnalyzer.py | 237 ++++++++++++++++++++++++++--------------- ai/LocationPredict.py | 182 ++----------------------------- 2 files changed, 158 insertions(+), 261 deletions(-) diff --git a/ai/LocationAnalyzer.py b/ai/LocationAnalyzer.py index ed841d80..162b25ed 100644 --- a/ai/LocationAnalyzer.py +++ b/ai/LocationAnalyzer.py @@ -1,127 +1,190 @@ -from pyclustering.cluster.gmeans import gmeans -from collections import Counter import numpy as np import pandas as pd +import json +import math +from scipy.spatial import distance +from sklearn.metrics.pairwise import haversine_distances +from pyclustering.cluster.gmeans import gmeans +from collections import Counter import warnings warnings.simplefilter(action='ignore', category=FutureWarning) # FutureWarning 제거 pd.set_option('mode.chained_assignment', None) class LocationAnalyzer: - def __init__(self, filename): - self.df = self.fileReader(filename) - - # 파일 읽기 - # 데이터 예시 (39.984702,116.318417,0,492,39744.1201851852,2008-10-23,02:53:04) - # (위도, 경도, 0, 고도, 1899년 이후 경과한 시간, 날짜, 시간) - def fileReader(self, filename): - - latitude = [] # 위도 - longitude = [] # 경도 - date = [] # 날짜 - time = [] # 시간 - - with open(filename, 'r') as file: - data = file.read() - - # 데이터에 불필요한 부분 제거 - # 추후 데이터 형식에 따라 수정 필요 * - # data = data.split('\n')[:-1] - data = data.split('\n')[6:-1] - for i in range(len(data)): - line = data[i].split(',') - latitude.append(line[0]) # 위도 - longitude.append(line[1]) # 경도 - #date.append(line[2]) # 날짜 - #time.append(line[3]) # 시간 - date.append(line[5]) - time.append(line[6]) - df = pd.DataFrame({"latitude":latitude, "longitude":longitude, "date":date, "time":time}) + def __init__(self, csv_path) -> None: + self.df = pd.DataFrame() + self.fileReader(csv_path) + def convert_day_to_number(self, day): + weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] + return weekdays.index(day) - df['latitude'] = df['latitude'].astype(float) - df['longitude'] = df['longitude'].astype(float) - df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S') - df['datetime'] = df['datetime'].dt.floor('T') - # 시간대와 요일 추가 - # 시간대 형식 : f00t04 f20t24 - # 4시간 단위로 분리 - df['hour_block'] = 'f' + ((df['datetime'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((df['datetime'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2) - df['day_of_week'] = df['datetime'].dt.day_name() - df = df.drop(['date', 'time'], axis=1) - df = df.drop_duplicates(['datetime'], ignore_index=True) - - return df - - # 의미장소 추출 - def gmeansFit(self, df): - # 두 열을 선택하고 넘파이 배열로 변환 - selectedColumns = ['latitude', 'longitude'] - resultList = df[selectedColumns].values.tolist() # 리스트로 변환 + def custom_parser(self, date_string): + return pd.to_datetime(date_string, format='%Y-%m-%d') # 날짜 형식에 맞게 지정 - gmeansInstance = gmeans(resultList).process() # 클러스터링 + def fileReader(self, csv_path): + data = pd.read_csv(csv_path, parse_dates=['date'], date_parser=self.custom_parser) + index = list(range(len(data))) + data.index = index + self.df = data[['date', 'time', 'latitude', 'longitude', 
'user_status']] - centers = gmeansInstance.get_centers() # 클러스터의 중심 (의미장소) - clusters = gmeansInstance.get_clusters() # 분류된 클러스터 + self.df['date'] = pd.to_datetime(self.df['date'], format='%Y-%m-%d') + self.df['time'] = pd.to_datetime(self.df['time'], format='%H:%M:%S') + self.df['hour_block'] = 'f' + ((self.df['time'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((self.df['time'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2) + self.df['day_of_week'] = self.df['date'].dt.day_name() - return clusters, centers - - # 호출 함수 - def gmeansFunc(self): + new_data = [] + for item in self.df['hour_block']: + num = int(item[1:-3]) + new_data.append(num) + self.df['hour_block'] = new_data + self.df['day_of_week'] = self.df['day_of_week'].apply(self.convert_day_to_number) + self.df = self.df.drop(['date', 'time'], axis=1) - clusters, centers = self.gmeansFit(self.df) + def gmeans_fit(self): + # 두 열을 선택하고 넘파이 배열로 변환 + selected_columns = ['latitude', 'longitude'] + result_list = self.df[selected_columns].values.tolist() - data_df = pd.DataFrame({"clusters":clusters, "centers":centers}) + gmeans_instance = gmeans(result_list).process() + + centers = gmeans_instance.get_centers() + clusters = gmeans_instance.get_clusters() + return clusters, centers + + def gmeans_func(self): + clusters, centers = self.gmeans_fit() + + data_df = pd.DataFrame({"clusters": clusters, "centers": centers}) + for k in range(len(data_df.clusters)): - if (len(data_df.clusters[k]) < 10): + if len(data_df.clusters[k]) < 10: data_df.drop(index=k, inplace=True) data_df = data_df.sort_index(axis=1) data_df = data_df.reset_index(drop=True) - - self.df['clusterNo'] = -1 + + self.df['cluster_no'] = -1 for i in range(len(data_df)): for j in range(len(data_df['clusters'].iloc[i])): k = data_df['clusters'].iloc[i][j] - self.df['clusterNo'].iloc[k] = i - - self.df = self.df[self.df['clusterNo'] != -1] - - + self.df['cluster_no'].iloc[k] = i + data_df['hour_block'] = 0 data_df['day_of_week'] = 0 - for i in range(max(self.df['clusterNo'])+1): - - counter = Counter(self.df[self.df['clusterNo'] == i]['hour_block']) + for i in range(max(self.df['cluster_no']) + 1): + counter = Counter(self.df[self.df['cluster_no'] == i]['hour_block']) most_hour_value = counter.most_common(1)[0][0] - counter = Counter(self.df[self.df['clusterNo'] == i]['day_of_week']) + counter = Counter(self.df[self.df['cluster_no'] == i]['day_of_week']) most_day_value = counter.most_common(1)[0][0] data_df['hour_block'].iloc[i] = most_hour_value data_df['day_of_week'].iloc[i] = most_day_value - data_list = data_df.values.tolist() - return data_list + data_df[['latitude', 'longitude']] = data_df['centers'].apply(lambda x: pd.Series(x)) + data_df.drop('centers', axis=1, inplace=True) + data_df = data_df[['latitude', 'longitude', 'clusters', 'hour_block', 'day_of_week']] + + meaningful_df = data_df[['latitude', 'longitude', 'hour_block', 'day_of_week']] + meaningful_df[['latitude', 'longitude']] = meaningful_df[['latitude', 'longitude']].round(4) + meaningful_df = meaningful_df.drop_duplicates(['latitude', 'longitude'], keep='first', ignore_index=True) + + return meaningful_df + + def haversine(self, lat1, lon1, lat2, lon2): + R = 6371000.0 # 지구 반지름 (미터) + lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2]) + dlat = lat2 - lat1 + dlon = lon2 - lon1 + + a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2 + c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) + distance = R * c + return distance + + def calculate_bearing(self, lat1, 
lon1, lat2, lon2): + lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2]) + dlon = lon2 - lon1 + x = math.sin(dlon) * math.cos(lat2) + y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(dlon) + initial_bearing = math.atan2(x, y) + initial_bearing = math.degrees(initial_bearing) + compass_bearing = (initial_bearing + 360) % 360 + return compass_bearing + def add_movement_features(self): + self.df['next_lat'] = self.df['latitude'].shift(1) + self.df['next_lon'] = self.df['longitude'].shift(1) + self.df['distance'] = self.df.apply(lambda row: self.haversine(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1) + self.df['bearing'] = self.df.apply(lambda row: self.calculate_bearing(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1) -if __name__ == '__main__': - # 파일 경로 가져오기 - filePath = r"C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20081202160051.txt" - la = LocationAnalyzer(filePath) + self.df = self.df.drop(['next_lat', 'next_lon'], axis=1) + self.df = self.df.fillna(0) + + def map_to_meaningful_places(self, meaningful_df): + y, m_hour_block, m_day_of_week = [], [], [] + for i in range(len(self.df)): + current_location = (self.df['latitude'].iloc[i], self.df['longitude'].iloc[i]) + min_distance = float('inf') + + for j in range(len(meaningful_df)): + place_location = (meaningful_df['latitude'].iloc[j], meaningful_df['longitude'].iloc[j]) + dist = distance.euclidean(current_location, place_location) + if dist < min_distance: + min_distance = dist + min_distance_index = j + + y.append(min_distance_index) + m_hour_block.append(meaningful_df['hour_block'].iloc[min_distance_index]) + m_day_of_week.append(meaningful_df['day_of_week'].iloc[min_distance_index]) + + self.df['m_hour_block'] = m_hour_block + self.df['m_day_of_week'] = m_day_of_week + self.df['y'] = y + + def calculate_additional_features(self): + self.df['speed'] = self.df['distance'] / (self.df['hour_block'] / 4 + 1) + self.df['lat_rate_change'] = self.df['latitude'].diff() / self.df['hour_block'].diff().replace(0, 1) + self.df['lon_rate_change'] = self.df['longitude'].diff() / self.df['hour_block'].diff().replace(0, 1) + + daily_variability = self.df.groupby('day_of_week')[['latitude', 'longitude']].std().add_suffix('_daily_var') + hourly_variability = self.df.groupby('hour_block')[['latitude', 'longitude']].std().add_suffix('_hourly_var') + self.df = self.df.join(daily_variability, on='day_of_week') + self.df = self.df.join(hourly_variability, on='hour_block') - data = la.gmeansFunc() + self.df['max_travel_range'] = self.df.groupby('hour_block')['distance'].transform('max') + + self.df['movement_direction'] = self.df['bearing'].apply(lambda x: 0 if x < 180 else 1) + self.df = self.df.fillna(0) + + + def run_analysis(self): + meaningful_df = self.gmeans_func() + self.add_movement_features() + self.map_to_meaningful_places(meaningful_df) + self.calculate_additional_features() + + y = self.df['y'] + self.df = self.df.drop(['y'], axis=1) + self.df['y'] = y + + return self.df, meaningful_df + + +if __name__ == '__main__': + + # csv 파일 가져오기 + # 필요한 데이터 'date', 'time', 'latitude', 'longitude', 'user_status' : 날짜 시간 위도 경도 이동상태 + csv_path = r"C:\Users\sk002\Downloads\138362.csv" + la = LocationAnalyzer(csv_path) + + df, meaningful_df = la.run_analysis() - print(data[1]) - print(data[1][0]) - print(data[1][0][0]) # latitude - 
print(data[1][0][1]) # longitude - print(data[1][2]) # time - print(data[1][3]) # w - - print(type(data[1][2])) - print(type(data[1][3])) + print(df) + print(meaningful_df) \ No newline at end of file diff --git a/ai/LocationPredict.py b/ai/LocationPredict.py index 808c82af..414b6dc4 100644 --- a/ai/LocationPredict.py +++ b/ai/LocationPredict.py @@ -4,7 +4,6 @@ import tensorflow as tf import json import math -import folium import tensorflow.keras.backend as K from tensorflow import keras from scipy.spatial import distance @@ -22,6 +21,7 @@ from tensorflow.keras.utils import to_categorical from pyclustering.cluster.gmeans import gmeans from collections import Counter +from LocationAnalyzer import LocationAnalyzer class ForecastLSTMClassification: @@ -312,190 +312,24 @@ def pred(self, return y_pred -# 전처리 -class Preprocessing: - def __init__(self, csv_path) -> None: - self.df = pd.DataFrame() - self.fileReader(csv_path) - - def convert_day_to_number(self, day): - weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] - return weekdays.index(day) - - def custom_parser(self, date_string): - return pd.to_datetime(date_string, format='%Y-%m-%d') # 날짜 형식에 맞게 지정 - - def fileReader(self, csv_path): - data = pd.read_csv(csv_path, parse_dates=['date'], date_parser=self.custom_parser) - index = list(range(len(data))) - data.index = index - self.df = data[['date', 'time', 'latitude', 'longitude', 'user_status']] - - self.df['date'] = pd.to_datetime(self.df['date'], format='%Y-%m-%d') - self.df['time'] = pd.to_datetime(self.df['time'], format='%H:%M:%S') - - self.df['hour_block'] = 'f' + ((self.df['time'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((self.df['time'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2) - self.df['day_of_week'] = self.df['date'].dt.day_name() - - new_data = [] - for item in self.df['hour_block']: - num = int(item[1:-3]) - new_data.append(num) - - self.df['hour_block'] = new_data - self.df['day_of_week'] = self.df['day_of_week'].apply(self.convert_day_to_number) - self.df = self.df.drop(['date', 'time'], axis=1) - - def gmeans_fit(self): - # 두 열을 선택하고 넘파이 배열로 변환 - selected_columns = ['latitude', 'longitude'] - result_list = self.df[selected_columns].values.tolist() - - gmeans_instance = gmeans(result_list).process() - - centers = gmeans_instance.get_centers() - clusters = gmeans_instance.get_clusters() - - return clusters, centers - - def gmeans_func(self): - clusters, centers = self.gmeans_fit() - - data_df = pd.DataFrame({"clusters": clusters, "centers": centers}) - - for k in range(len(data_df.clusters)): - if len(data_df.clusters[k]) < 10: - data_df.drop(index=k, inplace=True) - data_df = data_df.sort_index(axis=1) - data_df = data_df.reset_index(drop=True) - - self.df['cluster_no'] = -1 - for i in range(len(data_df)): - for j in range(len(data_df['clusters'].iloc[i])): - k = data_df['clusters'].iloc[i][j] - self.df['cluster_no'].iloc[k] = i - - data_df['hour_block'] = 0 - data_df['day_of_week'] = 0 - for i in range(max(self.df['cluster_no']) + 1): - counter = Counter(self.df[self.df['cluster_no'] == i]['hour_block']) - most_hour_value = counter.most_common(1)[0][0] - - counter = Counter(self.df[self.df['cluster_no'] == i]['day_of_week']) - most_day_value = counter.most_common(1)[0][0] - - data_df['hour_block'].iloc[i] = most_hour_value - data_df['day_of_week'].iloc[i] = most_day_value - - data_df[['latitude', 'longitude']] = data_df['centers'].apply(lambda x: pd.Series(x)) - data_df.drop('centers', axis=1, inplace=True) - data_df 
= data_df[['latitude', 'longitude', 'clusters', 'hour_block', 'day_of_week']] - - meaningful_df = data_df[['latitude', 'longitude', 'hour_block', 'day_of_week']] - meaningful_df[['latitude', 'longitude']] = meaningful_df[['latitude', 'longitude']].round(4) - meaningful_df = meaningful_df.drop_duplicates(['latitude', 'longitude'], keep='first', ignore_index=True) - - return meaningful_df - - def haversine(self, lat1, lon1, lat2, lon2): - R = 6371000.0 # 지구 반지름 (미터) - lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2]) - dlat = lat2 - lat1 - dlon = lon2 - lon1 - - a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2 - c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) - distance = R * c - return distance - - def calculate_bearing(self, lat1, lon1, lat2, lon2): - lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2]) - dlon = lon2 - lon1 - x = math.sin(dlon) * math.cos(lat2) - y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(dlon) - initial_bearing = math.atan2(x, y) - initial_bearing = math.degrees(initial_bearing) - compass_bearing = (initial_bearing + 360) % 360 - return compass_bearing - - def add_movement_features(self): - self.df['next_lat'] = self.df['latitude'].shift(1) - self.df['next_lon'] = self.df['longitude'].shift(1) - - self.df['distance'] = self.df.apply(lambda row: self.haversine(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1) - self.df['bearing'] = self.df.apply(lambda row: self.calculate_bearing(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1) - - self.df = self.df.drop(['next_lat', 'next_lon'], axis=1) - self.df = self.df.fillna(0) - - def map_to_meaningful_places(self, meaningful_df): - y, m_hour_block, m_day_of_week = [], [], [] - for i in range(len(self.df)): - current_location = (self.df['latitude'].iloc[i], self.df['longitude'].iloc[i]) - min_distance = float('inf') - - for j in range(len(meaningful_df)): - place_location = (meaningful_df['latitude'].iloc[j], meaningful_df['longitude'].iloc[j]) - dist = distance.euclidean(current_location, place_location) - if dist < min_distance: - min_distance = dist - min_distance_index = j - - y.append(min_distance_index) - m_hour_block.append(meaningful_df['hour_block'].iloc[min_distance_index]) - m_day_of_week.append(meaningful_df['day_of_week'].iloc[min_distance_index]) - - self.df['m_hour_block'] = m_hour_block - self.df['m_day_of_week'] = m_day_of_week - self.df['y'] = y - - def calculate_additional_features(self): - self.df['speed'] = self.df['distance'] / (self.df['hour_block'] / 4 + 1) - self.df['lat_rate_change'] = self.df['latitude'].diff() / self.df['hour_block'].diff().replace(0, 1) - self.df['lon_rate_change'] = self.df['longitude'].diff() / self.df['hour_block'].diff().replace(0, 1) - - daily_variability = self.df.groupby('day_of_week')[['latitude', 'longitude']].std().add_suffix('_daily_var') - hourly_variability = self.df.groupby('hour_block')[['latitude', 'longitude']].std().add_suffix('_hourly_var') - self.df = self.df.join(daily_variability, on='day_of_week') - self.df = self.df.join(hourly_variability, on='hour_block') - - self.df['max_travel_range'] = self.df.groupby('hour_block')['distance'].transform('max') - - self.df['movement_direction'] = self.df['bearing'].apply(lambda x: 0 if x < 180 else 1) - self.df = self.df.fillna(0) - - - def run_analysis(self): - meaningful_df = 
self.gmeans_func()
-        self.add_movement_features()
-        self.map_to_meaningful_places(meaningful_df)
-        self.calculate_additional_features()
-
-        y = self.df['y']
-        self.df = self.df.drop(['y'], axis=1)
-        self.df['y'] = y
-
-        return self.df


 if __name__ =='__main__':
-    # 전처리
-    csv_path = r"C:\Users\sk002\Downloads\138362.csv"
-    pr = Preprocessing(csv_path)
-    df = pr.run_analysis()
+    la = LocationAnalyzer(r"C:\Users\sk002\Downloads\138362.csv")
+    df, meaningful_df = la.run_analysis()

     test_idx = int(len(df) * 0.8)
     df_train = df.iloc[:test_idx]
     df_test = df.iloc[test_idx:]

     # 파라미터 설정
-    seq_len = 150 # 150개의 데이터를 feature로 사용
-    steps = 150 # 향후 150개 뒤의 y를 예측
-    single_output = False
+    seq_len = 15 # 15개의 데이터를 feature로 사용
+    steps = 15 # 향후 15개 뒤의 y를 예측
+    single_output = True
     metrics = ["accuracy"] # 모델 성능 지표

     lstm_params = {
         "seq_len": seq_len,
-        "epochs": 100, # epochs 반복 횟수
+        "epochs": 10, # epochs 반복 횟수
         "patience": 30, # early stopping 조건
         "steps_per_epoch": 5, # 1 epochs 시 dataset을 5개로 분할하여 학습
         "learning_rate": 0.03,
@@ -505,7 +339,7 @@ def run_analysis(self):
         "validation_split": 0.3, # 검증 데이터셋 30%
     }
     fl = ForecastLSTMClassification(class_num=len(df['y'].unique()))
-    model = fl.fit_lstm(
+    fl.fit_lstm(
         df=df_train,
         steps=steps,
         single_output=single_output,

From 0fb0dfd8b624bb5dc44be9d043f876440c412c66 Mon Sep 17 00:00:00 2001
From: epe12345 <132982056+epe12345@users.noreply.github.com>
Date: Fri, 7 Jun 2024 02:32:53 +0900
Subject: [PATCH 2/2] =?UTF-8?q?feat:=20=EC=9C=84=EC=B9=98=20=EC=98=88?=
 =?UTF-8?q?=EC=B8=A1=20=EB=AA=A8=EB=8D=B8=20=EB=B3=80=EA=B2=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Changed from tensorflow to pytorch in order to apply huggingface peft
---
 ai/LocationPredictUsingPEFT.py | 180 +++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 ai/LocationPredictUsingPEFT.py

diff --git a/ai/LocationPredictUsingPEFT.py b/ai/LocationPredictUsingPEFT.py
new file mode 100644
index 00000000..e84b15c6
--- /dev/null
+++ b/ai/LocationPredictUsingPEFT.py
@@ -0,0 +1,180 @@
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset, random_split
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import accuracy_score
+from LocationAnalyzer import LocationAnalyzer
+
+class ForecastLSTMClassification(nn.Module):
+    def __init__(self, class_num: int, input_dim: int, hidden_dim: int, layer_dim: int, output_dim: int, dropout_prob: float = 0.2):
+        super(ForecastLSTMClassification, self).__init__()
+        self.hidden_dim = hidden_dim
+        self.layer_dim = layer_dim
+
+        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob)
+        self.fc = nn.Linear(hidden_dim, output_dim)
+        self.softmax = nn.Softmax(dim=1)  # only needed if class probabilities are wanted at inference time
+
+    def forward(self, x):
+        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
+        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
+
+        out, _ = self.lstm(x, (h0, c0))
+        out = self.fc(out[:, -1, :])
+        # return raw logits: nn.CrossEntropyLoss in fit_lstm applies softmax internally
+        return out
+
+class LSTMModel:
+    def __init__(self, class_num: int, random_seed: int = 1234):
+        self.random_seed = random_seed
+        self.class_num = class_num
+        torch.manual_seed(random_seed)
+        np.random.seed(random_seed)
+
+    def reshape_dataset(self, df: pd.DataFrame) -> np.array:
+        dataset = df.values.reshape(df.shape)
+        return dataset
+
+    def split_sequences(self, dataset: np.array, seq_len: int, steps: int, single_output: bool) 
-> tuple: + X, y = [], [] + for i in range(len(dataset) - seq_len - steps + 1): + idx_in = i + seq_len + idx_out = idx_in + steps + + if idx_out > len(dataset): + break + + seq_x = dataset[i:idx_in, :-1] + seq_y = dataset[idx_in:idx_out, -1] + + X.append(seq_x) + y.append(seq_y[0] if single_output else seq_y) + + X = np.array(X) + y = np.array(y) + return X, y + + def split_train_valid_dataset(self, df: pd.DataFrame, seq_len: int, steps: int, single_output: bool, validation_split: float = 0.2) -> tuple: + dataset = self.reshape_dataset(df=df) + X, y = self.split_sequences(dataset=dataset, seq_len=seq_len, steps=steps, single_output=single_output) + + dataset_size = len(X) + train_size = int(dataset_size * (1-validation_split)) + valid_size = dataset_size - train_size + + X_train, y_train = torch.tensor(X[:train_size, :], dtype=torch.float32), torch.tensor(y[:train_size], dtype=torch.long) + X_val, y_val = torch.tensor(X[train_size:, :], dtype=torch.float32), torch.tensor(y[train_size:], dtype=torch.long) + + train_dataset = TensorDataset(X_train, y_train) + val_dataset = TensorDataset(X_val, y_val) + + return train_dataset, val_dataset + + def build_and_compile_lstm_model(self, seq_len: int, n_features: int, hidden_dim: int, layer_dim: int, dropout_prob: float = 0.2, learning_rate: float = 0.001): + model = ForecastLSTMClassification(self.class_num, n_features, hidden_dim, layer_dim, self.class_num, dropout_prob) + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters(), lr=learning_rate) + return model, criterion, optimizer + + def fit_lstm(self, df: pd.DataFrame, steps: int, hidden_dim: int, layer_dim: int, dropout_prob: float, seq_len: int, single_output: bool, epochs: int, batch_size: int, validation_split: float, learning_rate: float): + train_dataset, val_dataset = self.split_train_valid_dataset(df=df, seq_len=seq_len, steps=steps, single_output=single_output, validation_split=validation_split) + train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True) + val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False) + + model, criterion, optimizer = self.build_and_compile_lstm_model(seq_len=seq_len, n_features=train_dataset[0][0].shape[1], hidden_dim=hidden_dim, layer_dim=layer_dim, dropout_prob=dropout_prob, learning_rate=learning_rate) + + model.train() + for epoch in range(epochs): + for X_batch, y_batch in train_loader: + optimizer.zero_grad() + outputs = model(X_batch) + loss = criterion(outputs, y_batch.view(-1)) + loss.backward() + optimizer.step() + print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}') + + return model + + def forecast_validation_dataset(self, model, val_loader): + model.eval() + y_pred_list, y_val_list = [], [] + + with torch.no_grad(): + for X_batch, y_batch in val_loader: + outputs = model(X_batch) + _, predicted = torch.max(outputs.data, 1) + y_pred_list.extend(predicted.tolist()) + y_val_list.extend(y_batch.tolist()) + return pd.DataFrame({"y": y_val_list, "yhat": y_pred_list}) + + def pred(self, df: pd.DataFrame, model, steps: int, seq_len: int, single_output: bool, batch_size: int): + dataset = self.reshape_dataset(df=df) + X_test, y_test = self.split_sequences(dataset=dataset, seq_len=seq_len, steps=steps, single_output=single_output) + + X_test_tensor = torch.tensor(X_test, dtype=torch.float32) + y_test_tensor = torch.tensor(y_test, dtype=torch.long) + + test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=batch_size, shuffle=False) + + model.eval() + 
y_pred_list = [] + y_test_list = [] + + with torch.no_grad(): + for X_batch, y_batch in test_loader: + outputs = model(X_batch) + _, predicted = torch.max(outputs.data, 1) + y_pred_list.extend(predicted.tolist()) + y_test_list.extend(y_batch.tolist()) + + y_pred = np.array(y_pred_list) + y_test = np.array(y_test_list) + accuracy = accuracy_score(y_test, y_pred) + return y_pred, accuracy + +if __name__ == '__main__': + la = LocationAnalyzer(r"C:\Users\sk002\Downloads\138362.csv") + df, meaningful_df = la.run_analysis() + + test_idx = int(len(df) * 0.8) + df_train = df.iloc[:test_idx] + df_test = df.iloc[test_idx:] + + # 파라미터 설정 + seq_len = 30 + steps = 30 + single_output = True + lstm_params = { + "seq_len": seq_len, + "epochs": 30, + "patience": 30, + "learning_rate": 0.03, + "hidden_dim": 64, + "layer_dim": 2, + "dropout_prob": 0, + "batch_size": 32, + "validation_split": 0.3, + } + + lstm_model = LSTMModel(class_num=len(df['y'].unique())) + trained_model = lstm_model.fit_lstm( + df=df_train, + steps=steps, + hidden_dim=lstm_params["hidden_dim"], + layer_dim=lstm_params["layer_dim"], + dropout_prob=lstm_params["dropout_prob"], + seq_len=seq_len, + single_output=single_output, + epochs=lstm_params["epochs"], + batch_size=lstm_params["batch_size"], + validation_split=lstm_params["validation_split"], + learning_rate=lstm_params["learning_rate"] + ) + + y_pred, acc = lstm_model.pred(df=df_test, model=trained_model, steps=steps, seq_len=seq_len, single_output=single_output, batch_size=lstm_params["batch_size"]) + + print(y_pred) + print(f"acc : {acc}") \ No newline at end of file
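
Note (not part of the patch): the PATCH 2/2 message says the move from TensorFlow to PyTorch was made so that Hugging Face peft could be applied, but ai/LocationPredictUsingPEFT.py does not import or use peft anywhere yet. A minimal sketch of how LoRA adapters from the peft package might be attached to this model is shown below; the target module name "fc" matches the classifier head defined in ForecastLSTMClassification, while the hyperparameter values and feature/class counts are illustrative placeholders only, not taken from the patch.

    # Hypothetical sketch, assuming the Hugging Face `peft` package is installed.
    from peft import LoraConfig, get_peft_model
    from LocationPredictUsingPEFT import ForecastLSTMClassification

    n_features = 18   # placeholder: number of feature columns produced by LocationAnalyzer
    num_classes = 5   # placeholder: len(df['y'].unique())

    # Base LSTM classifier as defined in the patch.
    base_model = ForecastLSTMClassification(class_num=num_classes, input_dim=n_features,
                                            hidden_dim=64, layer_dim=2, output_dim=num_classes)

    # LoRA adapter on the final Linear layer ("fc"); r/alpha/dropout values are illustrative.
    lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05, target_modules=["fc"])
    peft_model = get_peft_model(base_model, lora_config)
    peft_model.print_trainable_parameters()  # only the LoRA adapter weights remain trainable

With such a wrapper, a training loop like the one in fit_lstm would update only the adapter weights (plus anything listed in modules_to_save), which is the usual reason for moving to a PyTorch model that peft can wrap.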