Merge branch 'develop'

tukcomCD2024 · Jun 20, 2024 · ef98c7b · ef98c7b
2 parents 8b06732 + 379eeba
commit ef98c7b
Show file tree

Hide file tree

Showing 153 changed files with 9,173 additions and 2,365 deletions.
diff --git a/ai/LocationAnalyzer.py b/ai/LocationAnalyzer.py
@@ -1,127 +1,190 @@
-from pyclustering.cluster.gmeans import gmeans
-from collections import Counter
 import numpy as np
 import pandas as pd
+import json
+import math
+from scipy.spatial import distance
+from sklearn.metrics.pairwise import haversine_distances
+from pyclustering.cluster.gmeans import gmeans
+from collections import Counter
 import warnings
 
 warnings.simplefilter(action='ignore', category=FutureWarning) # FutureWarning 제거
 pd.set_option('mode.chained_assignment', None)
 
 class LocationAnalyzer:
-    def __init__(self, filename):
-        self.df = self.fileReader(filename)
-
-    # 파일 읽기
-    # 데이터 예시 (39.984702,116.318417,0,492,39744.1201851852,2008-10-23,02:53:04)
-    # (위도, 경도, 0, 고도, 1899년 이후 경과한 시간, 날짜, 시간)
-    def fileReader(self, filename):
-
-        latitude = []   # 위도
-        longitude = []  # 경도
-        date = []       # 날짜
-        time = []       # 시간
-
-        with open(filename, 'r') as file:
-            data = file.read()
-
-        # 데이터에 불필요한 부분 제거
-        # 추후 데이터 형식에 따라 수정 필요 *
-        # data = data.split('\n')[:-1]
-        data = data.split('\n')[6:-1]
-        for i in range(len(data)):
-            line = data[i].split(',')
-            latitude.append(line[0])    # 위도
-            longitude.append(line[1])   # 경도
-            #date.append(line[2])        # 날짜
-            #time.append(line[3])        # 시간
-            date.append(line[5])
-            time.append(line[6])
-        df = pd.DataFrame({"latitude":latitude, "longitude":longitude, "date":date, "time":time})
+    def __init__(self, csv_path) -> None:  # csv_path is handed straight to fileReader, which loads it with pd.read_csv
+        self.df = pd.DataFrame()  # empty placeholder; fileReader replaces it with the loaded data
+        self.fileReader(csv_path)
 
+    def convert_day_to_number(self, day):
+        """Map an English weekday name to 0 (Monday) .. 6 (Sunday); any other value raises ValueError."""
+        return ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(day)
 
-        df['latitude'] = df['latitude'].astype(float)
-        df['longitude'] = df['longitude'].astype(float)
-        df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
-        df['datetime'] = df['datetime'].dt.floor('T')
-        # 시간대와 요일 추가
-        # 시간대 형식 : f00t04 f20t24
-        # 4시간 단위로 분리
-        df['hour_block'] = 'f' + ((df['datetime'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((df['datetime'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2)
-        df['day_of_week'] = df['datetime'].dt.day_name()
-        df = df.drop(['date', 'time'], axis=1)
-        df = df.drop_duplicates(['datetime'], ignore_index=True)
-
-        return df
-
-    # 의미장소 추출
-    def gmeansFit(self, df):
-        # 두 열을 선택하고 넘파이 배열로 변환
-        selectedColumns = ['latitude', 'longitude']
-        resultList = df[selectedColumns].values.tolist()    # 리스트로 변환
+    def custom_parser(self, date_string):
+        return pd.to_datetime(date_string, format='%Y-%m-%d')  # format is given explicitly to match the date layout
 
-        gmeansInstance = gmeans(resultList).process()       # 클러스터링
+    def fileReader(self, csv_path):  # Load the CSV into self.df and derive integer 'hour_block' / 'day_of_week' columns.
+        data = pd.read_csv(csv_path, parse_dates=['date'], date_parser=self.custom_parser)  # NOTE(review): date_parser is deprecated in pandas 2.x (date_format replaces it) - confirm the pinned pandas version
+        index = list(range(len(data)))
+        data.index = index
+        self.df = data[['date', 'time', 'latitude', 'longitude', 'user_status']]  # the CSV must provide these five columns
 
-        centers = gmeansInstance.get_centers()              # 클러스터의 중심 (의미장소)
-        clusters = gmeansInstance.get_clusters()            # 분류된 클러스터
+        self.df['date'] = pd.to_datetime(self.df['date'], format='%Y-%m-%d')
+        self.df['time'] = pd.to_datetime(self.df['time'], format='%H:%M:%S')
 
+        self.df['hour_block'] = 'f' + ((self.df['time'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((self.df['time'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2)  # 4-hour bucket label such as 'f08t12'
+        self.df['day_of_week'] = self.df['date'].dt.day_name()
 
-        return clusters, centers
-
-    # 호출 함수
-    def gmeansFunc(self):
+        new_data = []
+        for item in self.df['hour_block']:
+            num = int(item[1:-3])  # keep only the bucket's start hour: 'f08t12' -> 8
+            new_data.append(num)
 
+        self.df['hour_block'] = new_data
+        self.df['day_of_week'] = self.df['day_of_week'].apply(self.convert_day_to_number)  # weekday name -> 0 (Monday) .. 6 (Sunday)
+        self.df = self.df.drop(['date', 'time'], axis=1)
 
-        clusters, centers = self.gmeansFit(self.df)
+    def gmeans_fit(self):
+        # Select the two coordinate columns and convert them to a plain list of [latitude, longitude] pairs
+        selected_columns = ['latitude', 'longitude']
+        result_list = self.df[selected_columns].values.tolist()
 
-        data_df = pd.DataFrame({"clusters":clusters, "centers":centers})
+        gmeans_instance = gmeans(result_list).process()  # run pyclustering G-means on the coordinate pairs
+
+        centers = gmeans_instance.get_centers()
+        clusters = gmeans_instance.get_clusters()  # presumably one list of row positions per cluster (gmeans_func uses the entries with .iloc)
 
+        return clusters, centers
+
+    def gmeans_func(self):  # Cluster self.df, tag rows with 'cluster_no', and return one row per retained cluster centre.
+        clusters, centers = self.gmeans_fit()
+
+        data_df = pd.DataFrame({"clusters": clusters, "centers": centers})
+
         for k in range(len(data_df.clusters)):
-            if (len(data_df.clusters[k]) < 10):
+            if len(data_df.clusters[k]) < 10:  # discard clusters with fewer than 10 member rows
                 data_df.drop(index=k, inplace=True)
         data_df = data_df.sort_index(axis=1)
         data_df = data_df.reset_index(drop=True)
-
-        self.df['clusterNo'] = -1
+        
+        self.df['cluster_no'] = -1  # -1 marks rows that belong to no retained cluster
         for i in range(len(data_df)):
             for j in range(len(data_df['clusters'].iloc[i])):
                 k = data_df['clusters'].iloc[i][j]
-                self.df['clusterNo'].iloc[k] = i
-
-        self.df = self.df[self.df['clusterNo'] != -1]
-
-
+                self.df['cluster_no'].iloc[k] = i  # NOTE(review): chained assignment; under pandas Copy-on-Write it would not update self.df - verify
+
         data_df['hour_block'] = 0
         data_df['day_of_week'] = 0
-        for i in range(max(self.df['clusterNo'])+1):
-
-            counter = Counter(self.df[self.df['clusterNo'] == i]['hour_block'])
+        for i in range(max(self.df['cluster_no']) + 1):  # cluster labels 0..n-1 line up with data_df's reset index
+            counter = Counter(self.df[self.df['cluster_no'] == i]['hour_block'])  # most frequent hour block among the cluster's rows
             most_hour_value = counter.most_common(1)[0][0]
 
-            counter = Counter(self.df[self.df['clusterNo'] == i]['day_of_week'])
+            counter = Counter(self.df[self.df['cluster_no'] == i]['day_of_week'])  # most frequent weekday among the cluster's rows
             most_day_value = counter.most_common(1)[0][0]
 
             data_df['hour_block'].iloc[i] = most_hour_value
             data_df['day_of_week'].iloc[i] = most_day_value
 
-        data_list = data_df.values.tolist()
-        return data_list
+        data_df[['latitude', 'longitude']] = data_df['centers'].apply(lambda x: pd.Series(x))  # split each centre into two columns
+        data_df.drop('centers', axis=1, inplace=True)
+        data_df = data_df[['latitude', 'longitude', 'clusters', 'hour_block', 'day_of_week']]
+
+        meaningful_df = data_df[['latitude', 'longitude', 'hour_block', 'day_of_week']]
+        meaningful_df[['latitude', 'longitude']] = meaningful_df[['latitude', 'longitude']].round(4)  # round so near-identical centres collapse below
+        meaningful_df = meaningful_df.drop_duplicates(['latitude', 'longitude'], keep='first', ignore_index=True)
+
+        return meaningful_df  # columns: latitude, longitude, hour_block, day_of_week
+
+    def haversine(self, lat1, lon1, lat2, lon2):
+        """Return the great-circle distance in metres between two (lat, lon) points given in degrees."""
+        earth_radius_m = 6371000.0  # Earth radius in metres
+        lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
+        dlat, dlon = lat2 - lat1, lon2 - lon1
+
+        a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
+        # Rounding can push `a` just outside [0, 1] (near-antipodal points); clamp so sqrt(1 - a) cannot raise.
+        a = min(1.0, max(0.0, a))
+        return earth_radius_m * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
+
+    def calculate_bearing(self, lat1, lon1, lat2, lon2):
+        """Return the initial compass bearing in degrees, in [0, 360), from point 1 to point 2 (inputs in degrees)."""
+        phi1, lam1, phi2, lam2 = (math.radians(v) for v in (lat1, lon1, lat2, lon2))
+        delta_lam = lam2 - lam1
+        # East and north components of the forward azimuth.
+        east = math.sin(delta_lam) * math.cos(phi2)
+        north = math.cos(phi1) * math.sin(phi2) - math.sin(phi1) * math.cos(phi2) * math.cos(delta_lam)
+        # atan2 yields (-180, 180] degrees; shift into the compass range.
+        return (math.degrees(math.atan2(east, north)) + 360) % 360
 
+    def add_movement_features(self):  # Add 'distance' (metres, via haversine) and 'bearing' (degrees) between each row and its shifted neighbour.
+        self.df['next_lat'] = self.df['latitude'].shift(1)  # NOTE(review): shift(1) yields the PREVIOUS row's value despite the 'next_' name - confirm intent
+        self.df['next_lon'] = self.df['longitude'].shift(1)
 
+        self.df['distance'] = self.df.apply(lambda row: self.haversine(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1)
+        self.df['bearing'] = self.df.apply(lambda row: self.calculate_bearing(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1)
 
-if __name__ == '__main__':
-    # 파일 경로 가져오기
-    filePath = r"C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20081202160051.txt"
-    la = LocationAnalyzer(filePath)
+        self.df = self.df.drop(['next_lat', 'next_lon'], axis=1)  # helper columns are no longer needed
+        self.df = self.df.fillna(0)  # the first row has no neighbour, so its distance/bearing become 0 (this also zeroes any other NaN in the frame)
+
+    def map_to_meaningful_places(self, meaningful_df):
+        """Label every row of self.df with its nearest meaningful place.
+
+        Adds 'y' (position in meaningful_df of the closest place, by Euclidean distance in
+        raw latitude/longitude degrees) and that place's 'hour_block' / 'day_of_week' as
+        'm_hour_block' / 'm_day_of_week'.
+
+        Raises:
+            ValueError: self.df has rows but meaningful_df is empty (previously an UnboundLocalError).
+        """
+        if len(self.df) and not len(meaningful_df):
+            raise ValueError("meaningful_df is empty; cannot map locations to meaningful places")
+        places = meaningful_df[['latitude', 'longitude']].to_numpy()  # read once, not via .iloc per pair
+        # min() keeps the first index on ties, matching a strict '<' scan over the places.
+        nearest = [min(range(len(places)), key=lambda j: distance.euclidean(point, places[j]))
+                   for point in self.df[['latitude', 'longitude']].to_numpy()]
+
+        self.df['m_hour_block'] = meaningful_df['hour_block'].to_numpy()[nearest]
+        self.df['m_day_of_week'] = meaningful_df['day_of_week'].to_numpy()[nearest]
+        self.df['y'] = nearest
+
+    def calculate_additional_features(self):  # Derive speed, rate-of-change, variability, range and direction columns on self.df.
+        self.df['speed'] = self.df['distance'] / (self.df['hour_block'] / 4 + 1)  # NOTE(review): the divisor is the hour-block ordinal (1..6), not an elapsed time - confirm this proxy is intended
+        self.df['lat_rate_change'] = self.df['latitude'].diff() / self.df['hour_block'].diff().replace(0, 1)  # replace(0, 1) avoids dividing by zero when the block is unchanged
+        self.df['lon_rate_change'] = self.df['longitude'].diff() / self.df['hour_block'].diff().replace(0, 1)
+
+        daily_variability = self.df.groupby('day_of_week')[['latitude', 'longitude']].std().add_suffix('_daily_var')  # per-weekday std of the coordinates
+        hourly_variability = self.df.groupby('hour_block')[['latitude', 'longitude']].std().add_suffix('_hourly_var')  # per-hour-block std of the coordinates
+        self.df = self.df.join(daily_variability, on='day_of_week')
+        self.df = self.df.join(hourly_variability, on='hour_block')
 
-    data = la.gmeansFunc()
+        self.df['max_travel_range'] = self.df.groupby('hour_block')['distance'].transform('max')  # largest single-step distance within each hour block
+
+        self.df['movement_direction'] = self.df['bearing'].apply(lambda x: 0 if x < 180 else 1)  # 0 for bearings below 180 degrees, 1 otherwise
+        self.df = self.df.fillna(0)  # diff() and std() leave NaN (first row, single-member groups); zero them
+
+
+    def run_analysis(self):
+        """Run the full pipeline; return (feature frame with the label 'y' as its last column, meaningful places)."""
+        meaningful_df = self.gmeans_func()
+        self.add_movement_features()
+        self.map_to_meaningful_places(meaningful_df)
+        self.calculate_additional_features()
+
+        # Popping and re-inserting moves the label column 'y' to the end of the frame.
+        self.df['y'] = self.df.pop('y')
+
+        return self.df, meaningful_df
 
+
+
+if __name__ == '__main__':
+
+    # Load the CSV file
+    # Required columns: 'date', 'time', 'latitude', 'longitude', 'user_status' (date, time, latitude, longitude, movement status)
+    csv_path = r"C:\Users\sk002\Downloads\138362.csv"  # NOTE(review): hard-coded, machine-specific path
+    la = LocationAnalyzer(csv_path)
+
+    df, meaningful_df = la.run_analysis()
 
-    print(data[1])
-    print(data[1][0])
-    print(data[1][0][0]) # latitude
-    print(data[1][0][1]) # longitude
-    print(data[1][2]) # time
-    print(data[1][3]) # w
-
-    print(type(data[1][2]))
-    print(type(data[1][3]))
+    print(df)
+    print(meaningful_df)