Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
kyujin0911 committed Jun 20, 2024
2 parents 8b06732 + 379eeba commit ef98c7b
Show file tree
Hide file tree
Showing 153 changed files with 9,173 additions and 2,365 deletions.
237 changes: 150 additions & 87 deletions ai/LocationAnalyzer.py
Original file line number Diff line number Diff line change
@@ -1,127 +1,190 @@
from pyclustering.cluster.gmeans import gmeans
from collections import Counter
import numpy as np
import pandas as pd
import json
import math
from scipy.spatial import distance
from sklearn.metrics.pairwise import haversine_distances
from pyclustering.cluster.gmeans import gmeans
from collections import Counter
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning) # FutureWarning 제거
pd.set_option('mode.chained_assignment', None)

# Analyzer that keeps a pandas DataFrame of location records on self.df.
# NOTE(review): this text looks like a rendered commit diff whose indentation was
# stripped, so lines from two versions of the file appear interleaved. The
# filename-based __init__/fileReader here look like the removed side - confirm.
class LocationAnalyzer:
# Store the DataFrame returned by fileReader(filename) on self.df.
def __init__(self, filename):
self.df = self.fileReader(filename)

# Read the file.
# Example record: (39.984702,116.318417,0,492,39744.1201851852,2008-10-23,02:53:04)
# (latitude, longitude, 0, altitude, time elapsed since 1899, date, time)
def fileReader(self, filename):

latitude = [] # latitude
longitude = [] # longitude
date = [] # date
time = [] # time

with open(filename, 'r') as file:
data = file.read()

# Remove the unneeded part of the data.
# Needs to be adjusted later depending on the data format *
# data = data.split('\n')[:-1]
# Skips the first six lines and the last element of the split (presumably a
# header and the empty string after the final newline - confirm).
data = data.split('\n')[6:-1]
for i in range(len(data)):
line = data[i].split(',')
latitude.append(line[0]) # latitude
longitude.append(line[1]) # longitude
#date.append(line[2]) # date
#time.append(line[3]) # time
# Fields 5 and 6 of each comma-separated record are used as date and time.
date.append(line[5])
time.append(line[6])
# All four columns still hold strings at this point.
df = pd.DataFrame({"latitude":latitude, "longitude":longitude, "date":date, "time":time})
def __init__(self, csv_path) -> None:
    """Create the analyzer and load ``csv_path`` through ``fileReader``.

    Args:
        csv_path: Path of the CSV file handed to ``self.fileReader``.
    """
    # Begin with an empty frame, then let fileReader process the CSV.
    self.df = pd.DataFrame()
    self.fileReader(csv_path)

def convert_day_to_number(self, day):
    """Return the position of an English weekday name, Monday=0 .. Sunday=6.

    Args:
        day: Weekday name such as ``'Monday'``.

    Raises:
        ValueError: If ``day`` is not one of the seven names.
    """
    ordered_names = 'Monday Tuesday Wednesday Thursday Friday Saturday Sunday'.split()
    position = ordered_names.index(day)
    return position

# NOTE(review): these statements operate on a local `df` and end with `return df`;
# they look like the tail of the text-file reader from the other side of the
# diff - confirm against the real file.
# Convert the coordinate strings to floats.
df['latitude'] = df['latitude'].astype(float)
df['longitude'] = df['longitude'].astype(float)
# Combine the date and time strings into one timestamp, then floor it to the minute.
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
df['datetime'] = df['datetime'].dt.floor('T')
# Add the time block and the day of the week.
# Time block format: f00t04 ... f20t24
# Split into 4-hour units.
# 'f' + block start hour (hour // 4 * 4) + 't' + block end hour, both zero-padded to 2 digits.
df['hour_block'] = 'f' + ((df['datetime'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((df['datetime'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2)
df['day_of_week'] = df['datetime'].dt.day_name()
df = df.drop(['date', 'time'], axis=1)
# Keep a single row per (minute-floored) timestamp.
df = df.drop_duplicates(['datetime'], ignore_index=True)

return df

# Extract meaningful places.
# NOTE(review): only the first statements of gmeansFit are contiguous here; the
# rest of its body appears further down, mixed with other lines.
def gmeansFit(self, df):
# Select the two coordinate columns and convert them to a nested list.
selectedColumns = ['latitude', 'longitude']
resultList = df[selectedColumns].values.tolist() # convert to a list
def custom_parser(self, date_string):
    """Parse ``date_string`` with ``pd.to_datetime`` using the ``%Y-%m-%d`` format.

    Args:
        date_string: Date text (or array-like of date text) to parse.

    Returns:
        Whatever ``pd.to_datetime`` returns for the input.
    """
    # The format is pinned so it matches the date column's layout.
    parsed = pd.to_datetime(date_string, format='%Y-%m-%d')
    return parsed

# NOTE(review): lines using gmeansInstance/resultList, `return clusters, centers`
# and `def gmeansFunc` look like they belong to the other side of the diff and
# are interleaved with the CSV-based fileReader - confirm against the real file.
gmeansInstance = gmeans(resultList).process() # run the clustering
# Load the CSV at csv_path into self.df and derive hour_block / day_of_week.
def fileReader(self, csv_path):
# NOTE(review): `date_parser` is deprecated in recent pandas releases - verify
# against the pandas version in use.
data = pd.read_csv(csv_path, parse_dates=['date'], date_parser=self.custom_parser)
# Replace the index with 0..len(data)-1.
index = list(range(len(data)))
data.index = index
# Keep only these five columns; the CSV must provide all of them.
self.df = data[['date', 'time', 'latitude', 'longitude', 'user_status']]

centers = gmeansInstance.get_centers() # cluster centers (the meaningful places)
clusters = gmeansInstance.get_clusters() # the clusters the points were assigned to
self.df['date'] = pd.to_datetime(self.df['date'], format='%Y-%m-%d')
self.df['time'] = pd.to_datetime(self.df['time'], format='%H:%M:%S')

# 'f' + block start hour (hour // 4 * 4) + 't' + block end hour, both zero-padded to 2 digits.
self.df['hour_block'] = 'f' + ((self.df['time'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((self.df['time'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2)
self.df['day_of_week'] = self.df['date'].dt.day_name()

return clusters, centers

# Entry-point function.
def gmeansFunc(self):
# Turn each 'fXXtYY' label into the integer XX: the slice drops the leading
# 'f' and the trailing three characters.
new_data = []
for item in self.df['hour_block']:
num = int(item[1:-3])
new_data.append(num)

self.df['hour_block'] = new_data
# Replace weekday names with their number via convert_day_to_number.
self.df['day_of_week'] = self.df['day_of_week'].apply(self.convert_day_to_number)
self.df = self.df.drop(['date', 'time'], axis=1)

clusters, centers = self.gmeansFit(self.df)
# Cluster the latitude/longitude pairs of self.df with pyclustering's gmeans and
# return (clusters, centers) as reported by the fitted instance.
def gmeans_fit(self):
# Select the two coordinate columns and convert them to a nested list.
selected_columns = ['latitude', 'longitude']
result_list = self.df[selected_columns].values.tolist()

# NOTE(review): the next line uses `clusters`/`centers` before this function
# assigns them; it looks like an interleaved line from the other side of the
# diff - confirm.
data_df = pd.DataFrame({"clusters":clusters, "centers":centers})
gmeans_instance = gmeans(result_list).process()

centers = gmeans_instance.get_centers()
clusters = gmeans_instance.get_clusters()

return clusters, centers

# Build a table of meaningful places from the gmeans clusters: one row per kept
# cluster with its center and its most frequent hour_block / day_of_week.
# NOTE(review): statements using 'clusterNo' and 'cluster_no' appear in pairs, and
# there are two `return`s; this looks like two versions of the function
# interleaved by the diff rendering - confirm against the real file.
def gmeans_func(self):
clusters, centers = self.gmeans_fit()

data_df = pd.DataFrame({"clusters": clusters, "centers": centers})

# Drop clusters that contain fewer than 10 points.
for k in range(len(data_df.clusters)):
if (len(data_df.clusters[k]) < 10):
if len(data_df.clusters[k]) < 10:
data_df.drop(index=k, inplace=True)
# sort_index(axis=1) orders the columns by label; then rows are renumbered from 0.
data_df = data_df.sort_index(axis=1)
data_df = data_df.reset_index(drop=True)

# -1 marks rows that are not a member of any kept cluster.
self.df['clusterNo'] = -1
self.df['cluster_no'] = -1
# Write cluster number i to every row position k listed in cluster i.
for i in range(len(data_df)):
for j in range(len(data_df['clusters'].iloc[i])):
k = data_df['clusters'].iloc[i][j]
# NOTE(review): chained assignment (column selection followed by .iloc[...] = ...);
# whether it writes through to self.df depends on the pandas version - verify.
self.df['clusterNo'].iloc[k] = i

# Keep only rows that were assigned to a cluster.
self.df = self.df[self.df['clusterNo'] != -1]


self.df['cluster_no'].iloc[k] = i

data_df['hour_block'] = 0
data_df['day_of_week'] = 0
for i in range(max(self.df['clusterNo'])+1):

counter = Counter(self.df[self.df['clusterNo'] == i]['hour_block'])
for i in range(max(self.df['cluster_no']) + 1):
# Most frequent hour_block among the rows of cluster i.
counter = Counter(self.df[self.df['cluster_no'] == i]['hour_block'])
most_hour_value = counter.most_common(1)[0][0]

counter = Counter(self.df[self.df['clusterNo'] == i]['day_of_week'])
# Most frequent day_of_week among the rows of cluster i.
counter = Counter(self.df[self.df['cluster_no'] == i]['day_of_week'])
most_day_value = counter.most_common(1)[0][0]

data_df['hour_block'].iloc[i] = most_hour_value
data_df['day_of_week'].iloc[i] = most_day_value

data_list = data_df.values.tolist()
return data_list
# Split each center into separate latitude / longitude columns.
data_df[['latitude', 'longitude']] = data_df['centers'].apply(lambda x: pd.Series(x))
data_df.drop('centers', axis=1, inplace=True)
data_df = data_df[['latitude', 'longitude', 'clusters', 'hour_block', 'day_of_week']]

# Round the centers to 4 decimals and keep the first row of each rounded coordinate pair.
meaningful_df = data_df[['latitude', 'longitude', 'hour_block', 'day_of_week']]
meaningful_df[['latitude', 'longitude']] = meaningful_df[['latitude', 'longitude']].round(4)
meaningful_df = meaningful_df.drop_duplicates(['latitude', 'longitude'], keep='first', ignore_index=True)

return meaningful_df

def haversine(self, lat1, lon1, lat2, lon2):
    """Return the great-circle distance in metres between two points.

    Uses the haversine formula on a sphere of radius 6,371,000 m.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance in metres as a float (NaN if any input is NaN).
    """
    earth_radius_m = 6371000.0  # Earth radius (metres)
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise a domain error. The explicit
    # comparisons (rather than min/max) leave a NaN untouched so it still
    # propagates to the result.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    # The result name avoids shadowing the module-level `distance` import.
    return earth_radius_m * c

def calculate_bearing(self, lat1, lon1, lat2, lon2):
    """Return the initial compass bearing from point 1 to point 2.

    Args:
        lat1: Latitude of the start point, in degrees.
        lon1: Longitude of the start point, in degrees.
        lat2: Latitude of the end point, in degrees.
        lon2: Longitude of the end point, in degrees.

    Returns:
        Bearing in degrees, normalised into [0, 360) by ``(b + 360) % 360``.
    """
    phi1 = math.radians(lat1)
    lam1 = math.radians(lon1)
    phi2 = math.radians(lat2)
    lam2 = math.radians(lon2)
    delta_lam = lam2 - lam1

    # First and second arguments of atan2, in that order.
    east = math.sin(delta_lam) * math.cos(phi2)
    north = math.cos(phi1) * math.sin(phi2) - math.sin(phi1) * math.cos(phi2) * math.cos(delta_lam)

    bearing_deg = math.degrees(math.atan2(east, north))
    return (bearing_deg + 360) % 360

# Add per-row 'distance' (haversine, metres) and 'bearing' (degrees) columns to
# self.df, computed between each row and the row paired with it by shift(1).
def add_movement_features(self):
# NOTE(review): shift(1) moves values down one position, so 'next_lat'/'next_lon'
# actually hold the PREVIOUS row's coordinates (the first row gets NaN); the
# bearing therefore points from the current fix back to the previous one.
# Confirm whether shift(-1) was intended.
self.df['next_lat'] = self.df['latitude'].shift(1)
self.df['next_lon'] = self.df['longitude'].shift(1)

# Rows without a paired point get None here.
self.df['distance'] = self.df.apply(lambda row: self.haversine(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1)
self.df['bearing'] = self.df.apply(lambda row: self.calculate_bearing(row['latitude'], row['longitude'], row['next_lat'], row['next_lon']) if pd.notna(row['next_lat']) else None, axis=1)

# NOTE(review): the `if __name__ == '__main__':` line and the three lines after
# it look like script lines from the other side of the diff, interleaved into
# this method - confirm against the real file.
if __name__ == '__main__':
# Get the file path.
filePath = r"C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20081202160051.txt"
la = LocationAnalyzer(filePath)
# Remove the helper columns, then replace every remaining missing value in the
# frame with 0.
self.df = self.df.drop(['next_lat', 'next_lon'], axis=1)
self.df = self.df.fillna(0)

def map_to_meaningful_places(self, meaningful_df):
    """Label every row of ``self.df`` with its nearest meaningful place.

    "Nearest" is the smallest straight-line (Euclidean) distance measured
    directly on the latitude/longitude values in degrees. When several places
    are equally close, the one that comes first in ``meaningful_df`` wins.

    Adds three columns to ``self.df``:
        * ``m_hour_block``  - ``hour_block`` of the nearest place
        * ``m_day_of_week`` - ``day_of_week`` of the nearest place
        * ``y``             - positional index of the nearest place

    Args:
        meaningful_df: DataFrame with ``latitude``, ``longitude``,
            ``hour_block`` and ``day_of_week`` columns, one row per place.

    Raises:
        ValueError: If ``self.df`` has rows but ``meaningful_df`` has none.
            The previous loop failed in that case with an unbound local
            variable.
    """
    if len(meaningful_df) == 0 and len(self.df) > 0:
        raise ValueError("meaningful_df must contain at least one place")

    point_lat = self.df['latitude'].to_numpy(dtype=float)
    point_lon = self.df['longitude'].to_numpy(dtype=float)
    place_lat = meaningful_df['latitude'].to_numpy(dtype=float)
    place_lon = meaningful_df['longitude'].to_numpy(dtype=float)

    # Iterate over the (few) places and update all points at once, rather than
    # running a Python loop over every point/place pair. Squared distances are
    # enough because only their ordering matters.
    best_sq = np.full(len(point_lat), np.inf)
    # Every point starts at place 0. A point whose coordinates are NaN never
    # compares "closer" and keeps this default, instead of silently inheriting
    # the index chosen for the previous row.
    nearest = np.zeros(len(point_lat), dtype=np.intp)
    for place_idx, (lat, lon) in enumerate(zip(place_lat, place_lon)):
        sq_dist = (point_lat - lat) ** 2 + (point_lon - lon) ** 2
        closer = sq_dist < best_sq  # strict: the first of equal minima wins
        best_sq[closer] = sq_dist[closer]
        nearest[closer] = place_idx

    self.df['m_hour_block'] = meaningful_df['hour_block'].to_numpy()[nearest]
    self.df['m_day_of_week'] = meaningful_df['day_of_week'].to_numpy()[nearest]
    self.df['y'] = nearest

# Derive extra columns on self.df from 'distance', 'bearing', 'hour_block',
# 'day_of_week' and the coordinates; all of these must already exist.
def calculate_additional_features(self):
# distance divided by (hour_block / 4 + 1).
# NOTE(review): the divisor comes from the hour-block value, not from elapsed
# time between rows - confirm this is the intended notion of speed.
self.df['speed'] = self.df['distance'] / (self.df['hour_block'] / 4 + 1)
# Row-to-row coordinate change per change in hour_block; a zero hour_block
# difference is replaced by 1 so the division never divides by zero.
self.df['lat_rate_change'] = self.df['latitude'].diff() / self.df['hour_block'].diff().replace(0, 1)
self.df['lon_rate_change'] = self.df['longitude'].diff() / self.df['hour_block'].diff().replace(0, 1)

# Standard deviation of the coordinates per day_of_week and per hour_block,
# joined back onto every row with '_daily_var' / '_hourly_var' suffixes.
daily_variability = self.df.groupby('day_of_week')[['latitude', 'longitude']].std().add_suffix('_daily_var')
hourly_variability = self.df.groupby('hour_block')[['latitude', 'longitude']].std().add_suffix('_hourly_var')
self.df = self.df.join(daily_variability, on='day_of_week')
self.df = self.df.join(hourly_variability, on='hour_block')

# NOTE(review): the next line refers to `la`, which is not defined inside this
# method; it looks like a script line from the other side of the diff - confirm.
data = la.gmeansFunc()
# Largest 'distance' observed within each row's hour_block.
self.df['max_travel_range'] = self.df.groupby('hour_block')['distance'].transform('max')

# 0 when bearing < 180, otherwise 1.
self.df['movement_direction'] = self.df['bearing'].apply(lambda x: 0 if x < 180 else 1)
# Replace every remaining missing value in the frame with 0.
self.df = self.df.fillna(0)


def run_analysis(self):
    """Run the whole pipeline and return ``(self.df, meaningful_df)``.

    Calls, in order: ``gmeans_func``, ``add_movement_features``,
    ``map_to_meaningful_places`` and ``calculate_additional_features``, then
    moves the ``y`` column to the last position of ``self.df``.

    Returns:
        A tuple of the feature DataFrame and the value returned by
        ``gmeans_func``.
    """
    places = self.gmeans_func()
    self.add_movement_features()
    self.map_to_meaningful_places(places)
    self.calculate_additional_features()

    # Dropping 'y' and assigning it again appends it as the final column.
    labels = self.df['y']
    self.df = self.df.drop(columns=['y'])
    self.df['y'] = labels

    return self.df, places



# Script entry point: build an analyzer from a CSV file, run the analysis and
# print the results.
if __name__ == '__main__':

# Load the csv file.
# Required columns 'date', 'time', 'latitude', 'longitude', 'user_status':
# date, time, latitude, longitude, movement status.
csv_path = r"C:\Users\sk002\Downloads\138362.csv"
la = LocationAnalyzer(csv_path)

df, meaningful_df = la.run_analysis()

# NOTE(review): the print calls below that index into `data` use a name this
# script section never assigns; they look like lines from the other side of the
# diff - confirm against the real file.
print(data[1])
print(data[1][0])
print(data[1][0][0]) # latitude
print(data[1][0][1]) # longitude
print(data[1][2]) # time
print(data[1][3]) # w

print(type(data[1][2]))
print(type(data[1][3]))
print(df)
print(meaningful_df)
Loading

0 comments on commit ef98c7b

Please sign in to comment.