Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More precise newsflash coordinates (using Waze data) #1552

Draft
wants to merge 10 commits into
base: dev
Choose a base branch
from
2 changes: 2 additions & 0 deletions anyway/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
short_distance_resolutions = ["צומת עירוני", "צומת בינעירוני", "רחוב"]
long_distance_resolutions = ["עיר", "נפה", "מחוז", "כביש בינעירוני"]
resolution_dict = {
"מחוז": ["region_hebrew"],
"נפה": ["district_hebrew"],
Expand Down
30 changes: 3 additions & 27 deletions anyway/parsers/injured_around_schools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import shutil
from datetime import datetime

import math
import pandas as pd
from sqlalchemy import or_, not_, and_

Expand All @@ -17,6 +16,7 @@
InjuredAroundSchoolAllData,
)
from anyway.utilities import time_delta, chunks
from anyway.parsers.utils import get_bounding_box_polygon
from anyway.app_and_db import db

SUBTYPE_ACCIDENT_WITH_PEDESTRIAN = 1
Expand All @@ -32,37 +32,13 @@
DATE_URL_FORMAT = "%Y-%m-%d"


def get_bounding_box(latitude, longitude, distance_in_km):
    """Return the latitude/longitude bounding box around a point.

    The box reaches ``distance_in_km`` north, south, east and west of the
    point, computed on a sphere of radius 6371 km.

    Args:
        latitude: Latitude of the centre point, in degrees.
        longitude: Longitude of the centre point, in degrees.
        distance_in_km: Distance from the centre to each side of the box,
            in kilometres.

    Returns:
        A ``(lat_min, lon_min, lat_max, lon_max)`` tuple in degrees, with
        latitudes clamped to [-90, 90] and longitudes to [-180, 180].
    """
    earth_radius_km = 6371
    lat_rad = math.radians(latitude)
    lon_rad = math.radians(longitude)

    # Radius of the parallel (circle of constant latitude) through the point.
    parallel_radius_km = earth_radius_km * math.cos(lat_rad)

    lat_delta = distance_in_km / earth_radius_km
    if parallel_radius_km > 0:
        # Half a turn already covers every longitude, so never exceed it.
        lon_delta = min(distance_in_km / parallel_radius_km, math.pi)
    else:
        # At (or beyond) a pole the parallel degenerates to a point:
        # avoid dividing by zero and span all longitudes instead.
        lon_delta = math.pi

    lat_min = max(math.degrees(lat_rad - lat_delta), -90.0)
    lat_max = min(math.degrees(lat_rad + lat_delta), 90.0)
    lon_min = max(math.degrees(lon_rad - lon_delta), -180.0)
    lon_max = min(math.degrees(lon_rad + lon_delta), 180.0)

    return lat_min, lon_min, lat_max, lon_max


def acc_inv_query(longitude, latitude, distance, start_date, end_date, school):
lat_min, lon_min, lat_max, lon_max = get_bounding_box(latitude, longitude, distance)
baseX = lon_min
baseY = lat_min
distanceX = lon_max
distanceY = lat_max
pol_str = "POLYGON(({0} {1},{0} {3},{2} {3},{2} {1},{0} {1}))".format(
baseX, baseY, distanceX, distanceY
)
polygon_str = get_bounding_box_polygon(latitude, longitude, distance)

query_obj = (
db.session.query(Involved, AccidentMarker)
.join(AccidentMarker, AccidentMarker.provider_and_id == Involved.provider_and_id)
.filter(AccidentMarker.geom.intersects(pol_str))
.filter(AccidentMarker.geom.intersects(polygon_str))
.filter(Involved.injured_type == INJURED_TYPE_PEDESTRIAN)
.filter(AccidentMarker.provider_and_id == Involved.provider_and_id)
.filter(
Expand Down
55 changes: 52 additions & 3 deletions anyway/parsers/location_extraction.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime, timedelta
import logging
import re

Expand All @@ -6,10 +7,13 @@
import numpy as np
from geographiclib.geodesic import Geodesic

from anyway.models import NewsFlash
from anyway.parsers import resolution_dict
from anyway.models import NewsFlash, WazeAlert
from anyway.parsers import resolution_dict, short_distance_resolutions, long_distance_resolutions
from anyway.parsers.utils import get_bounding_box_polygon
from anyway import secrets

WAZE_ALERT_NEWSFLASH_DELTA_IN_HOURS = 3


def extract_road_number(location):
"""
Expand Down Expand Up @@ -297,15 +301,60 @@ def extract_location_text(text):
return text


def get_related_waze_accident_alert(db, geo_location, newsflash):
    """Find a Waze accident alert that may describe the same event as a newsflash.

    Alerts are searched inside a bounding box around the geocoded location.
    The box size depends on the newsflash resolution, and only alerts created
    between a few hours before the newsflash date and the current time are
    considered.

    Args:
        db: Database handle exposing ``session``.
        geo_location: Mapping with the geocoded coordinates, either directly
            under ``lat``/``lon`` or nested under ``geom`` as ``lat``/``lng``
            (the shape ``extract_geo_features`` reads its coordinates from).
        newsflash: Newsflash whose ``resolution`` and ``date`` are already set.

    Returns:
        The first matching ``WazeAlert``, or ``None`` when nothing matches or
        the resolution is not a known short- or long-distance resolution.
    """
    # Search distance (in kilometres) depends on how precise the newsflash
    # location is expected to be.
    if newsflash.resolution in short_distance_resolutions:
        distance_in_km = 0.3
    elif newsflash.resolution in long_distance_resolutions:
        distance_in_km = 5
    else:
        # unknown resolution - skip this optimization
        return None

    # Accept both the flat {"lat", "lon"} shape and the geocoder shape that
    # keeps the coordinates under "geom" as "lat"/"lng".
    coordinates = geo_location.get("geom", geo_location)
    latitude = coordinates["lat"]
    longitude = coordinates["lon"] if "lon" in coordinates else coordinates["lng"]

    bounding_box_polygon_str = get_bounding_box_polygon(latitude, longitude, distance_in_km)

    # NOTE(review): reviewers pointed out that the look-back constant is only
    # used here, so it should probably be a local constant, and that it can be
    # a timedelta (slightly more type-safe, keeps the filter on one line).
    # The timedelta is therefore built once, locally.
    look_back = timedelta(hours=WAZE_ALERT_NEWSFLASH_DELTA_IN_HOURS)

    # The first accident alert inside the box and time window is taken as the
    # related alert.
    return (
        db.session.query(WazeAlert)
        .filter(WazeAlert.alert_type == "ACCIDENT")
        .filter(WazeAlert.created_at.between(newsflash.date - look_back, datetime.now()))
        .filter(WazeAlert.geom.intersects(bounding_box_polygon_str))
        .first()
    )


def extract_geo_features(db, newsflash: NewsFlash) -> None:
newsflash.location = extract_location_text(newsflash.description) or extract_location_text(
newsflash.title
)
geo_location = geocode_extract(newsflash.location)
if geo_location is not None:
newsflash.resolution = set_accident_resolution(geo_location)

newsflash.lat = geo_location["geom"]["lat"]
newsflash.lon = geo_location["geom"]["lng"]
newsflash.resolution = set_accident_resolution(geo_location)

# improve location using waze
related_waze_accident = get_related_waze_accident_alert(db, geo_location, newsflash)
if related_waze_accident:
newsflash.waze_alert = related_waze_accident.id

# TODO: uncomment this after testing the related waze accidents mechanism is working properly on real data
# newsflash.lat = related_waze_accident.latitude
# newsflash.lon = related_waze_accident.longitude

location_from_db = get_db_matching_location(
db,
newsflash.lat,
Expand Down
27 changes: 27 additions & 0 deletions anyway/parsers/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import math


def batch_iterator(iterable, batch_size):
iterator = iter(iterable)
iteration_stopped = False
Expand All @@ -14,3 +17,27 @@ def batch_iterator(iterable, batch_size):
yield batch
if iteration_stopped:
break


def get_bounding_box_polygon(latitude, longitude, distance_in_km):
    """Return a WKT ``POLYGON`` string for the bounding box around a point.

    The box reaches ``distance_in_km`` north, south, east and west of the
    point, computed on a sphere of radius 6371 km.

    Args:
        latitude: Latitude of the centre point, in degrees.
        longitude: Longitude of the centre point, in degrees.
        distance_in_km: Distance from the centre to each side of the box,
            in kilometres.

    Returns:
        A closed five-point ``POLYGON((lon lat, ...))`` string in degrees,
        starting and ending at the (lon_min, lat_min) corner. Latitudes are
        clamped to [-90, 90] and longitudes to [-180, 180].
    """
    earth_radius_km = 6371
    lat_rad = math.radians(latitude)
    lon_rad = math.radians(longitude)

    # Radius of the parallel (circle of constant latitude) through the point.
    parallel_radius_km = earth_radius_km * math.cos(lat_rad)

    lat_delta = distance_in_km / earth_radius_km
    if parallel_radius_km > 0:
        # Half a turn already covers every longitude, so never exceed it.
        lon_delta = min(distance_in_km / parallel_radius_km, math.pi)
    else:
        # At (or beyond) a pole the parallel degenerates to a point:
        # avoid dividing by zero and span all longitudes instead.
        lon_delta = math.pi

    lat_min = max(math.degrees(lat_rad - lat_delta), -90.0)
    lat_max = min(math.degrees(lat_rad + lat_delta), 90.0)
    lon_min = max(math.degrees(lon_rad - lon_delta), -180.0)
    lon_max = min(math.degrees(lon_rad + lon_delta), 180.0)

    return "POLYGON(({0} {1},{0} {3},{2} {3},{2} {1},{0} {1}))".format(
        lon_min, lat_min, lon_max, lat_max
    )
elazarg marked this conversation as resolved.
Show resolved Hide resolved
93 changes: 91 additions & 2 deletions tests/test_news_flash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@

import pytest

from anyway.app_and_db import db
from anyway.parsers import rss_sites, twitter, location_extraction
from anyway.parsers.news_flash_classifiers import classify_tweets, classify_rss
from anyway import secrets
from anyway.parsers.news_flash_db_adapter import init_db
from anyway.models import NewsFlash
from anyway.parsers import timezones
from anyway.models import NewsFlash, WazeAlert
from anyway.parsers import timezones, short_distance_resolutions, long_distance_resolutions
from anyway.parsers.infographics_data_cache_updater import is_cache_eligible, is_in_cache


Expand Down Expand Up @@ -232,6 +233,57 @@ def test_extract_location_text():
assert expected_location_text == actual_location_text


def test_waze_alert():
    """Check that a Waze accident alert is matched to a newsflash by distance and resolution."""
    alert = _create_waze_accident_alert()

    def find_related(flash, location, resolution):
        # the lookup reads the resolution from the newsflash, so set it first
        flash.resolution = resolution
        return location_extraction.get_related_waze_accident_alert(db, location, flash)

    try:
        flash = NewsFlash(date=datetime.datetime.now())

        # a location close to the waze accident alert
        nearby = {
            "lon": alert.longitude + 0.001,
            "lat": alert.latitude + 0.0001,
        }

        # every short-distance resolution must find the alert from close by
        for resolution in short_distance_resolutions:
            found = find_related(flash, nearby, resolution)
            assert alert == found

        # a location further away from the alert
        farther = {
            "lon": alert.longitude + 0.01,
            "lat": alert.latitude + 0.0001,
        }

        # short-distance resolutions must *not* reach the alert from there
        for resolution in short_distance_resolutions:
            found = find_related(flash, farther, resolution)
            assert found is None

        # long-distance resolutions must still reach it
        for resolution in long_distance_resolutions:
            found = find_related(flash, farther, resolution)
            assert alert == found

    finally:
        _delete_waze_alert(alert.id)


def test_timeparse():
twitter = timezones.parse_creation_datetime("Sun May 31 08:26:18 +0000 2020")
ynet = timezones.parse_creation_datetime("Sun, 31 May 2020 11:26:18 +0300")
Expand Down Expand Up @@ -268,3 +320,40 @@ def test_classification_statistics_ynet():
assert precision > BEST_PRECISION_YNET
assert recall > BEST_RECALL_YNET
assert f1 > BEST_F1_YNET


elazarg marked this conversation as resolved.
Show resolved Hide resolved
def _create_waze_accident_alert():
    """Insert a Waze ``ACCIDENT`` alert into the database and return it.

    The alert is created "now" at a fixed point, so tests can look it up by
    location and time. Callers are responsible for deleting it afterwards.

    Returns:
        The committed ``WazeAlert`` instance.
    """
    # NOTE(review): a reviewer remarked that, with a context manager handling
    # the cleanup, this helper and _delete_waze_alert need not be separate
    # functions and could simply be inlined.

    # A plain integer id, one past the current row count.
    # NOTE(review): presumably ids are dense (1..count) in the test database;
    # otherwise this could collide with an existing row - confirm.
    alert_id = db.session.query(WazeAlert).count() + 1

    longitude = 31.0
    latitude = 34.0
    point_str = "POINT({0} {1})".format(longitude, latitude)

    waze_alert = WazeAlert(
        id=alert_id,
        city='באר שבע',
        confidence=2,
        created_at=datetime.datetime.now(),
        longitude=longitude,
        latitude=latitude,
        magvar=190,
        number_thumbs_up=1,
        report_rating=5,
        reliability=10,
        alert_type='ACCIDENT',
        alert_subtype='',
        street='דרך מצדה',
        road_type=3,
        geom=point_str,
    )
    db.session.add(waze_alert)
    db.session.commit()

    return waze_alert


def _delete_waze_alert(waze_alert_id):
    """Delete the WazeAlert row(s) with the given id and commit the change."""
    matching_alerts = db.session.query(WazeAlert).filter_by(id=waze_alert_id)
    matching_alerts.delete()
    db.session.commit()