feat: fixes after testing with noisier data (#2)

* print a warning to console when invalid coordinates are present in dataset.
* rename cli to __main__ to allow calling as a python module
* move data validation to `io` module
This commit is contained in:
Miguel Sozinho Ramalho
2022-06-30 12:13:02 +01:00
committed by Felix Spöttel
parent 50f8a872e6
commit 729c2e49bf
4 changed files with 36 additions and 4 deletions

View File

@@ -50,5 +50,5 @@ def cluster_locations(df, algorithm, radius_km, min_cluster_size):
n_jobs=-1, n_jobs=-1,
) )
X = np.radians(np.array(coordinates)) X = np.radians(np.array(coordinates).astype(float))
return to_cluster_dict(df, clustering.fit(X)) return to_cluster_dict(df, clustering.fit(X))

View File

@@ -64,8 +64,8 @@ class GeoJSONEncoder:
"cluster_id": cluster_id, "cluster_id": cluster_id,
} }
lon = props.pop("lon") lon = float(props.pop("lon"))
lat = props.pop("lat") lat = float(props.pop("lat"))
point = geojson.Point((lon, lat)) point = geojson.Point((lon, lat))
self.state.append(geojson.Feature(geometry=point, properties=props)) self.state.append(geojson.Feature(geometry=point, properties=props))

View File

@@ -4,11 +4,43 @@ from pkg_resources import resource_filename
import json import json
import json import json
import pandas as pd import pandas as pd
import numpy as np
def is_valid_lat(val: str) -> bool:
    """Given a string, check if it corresponds to a valid decimal latitude value.

    Args:
        val: Text (or any value accepted by ``float``) to validate.

    Returns:
        True when ``val`` converts to a float within [-90, 90], False
        otherwise. Values that cannot be converted, as well as NaN and
        infinities, are reported as invalid.
    """
    try:
        latitude = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "invalid"; anything else (e.g.
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    # NaN fails both comparisons, so it is rejected without a special case.
    return -90 <= latitude <= 90
def is_valid_lon(val: str) -> bool:
    """Given a string, check if it corresponds to a valid decimal longitude value.

    Args:
        val: Text (or any value accepted by ``float``) to validate.

    Returns:
        True when ``val`` converts to a float within [-180, 180], False
        otherwise. Values that cannot be converted, as well as NaN and
        infinities, are reported as invalid.
    """
    try:
        longitude = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "invalid"; anything else (e.g.
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    # NaN fails both comparisons, so it is rejected without a special case.
    return -180 <= longitude <= 180
def read_csv_file(filename):
    """Read input csv file, dropping rows that don't have valid location data.

    Rows whose ``lat`` or ``lon`` is missing are dropped first, then rows whose
    coordinates fail ``is_valid_lat`` / ``is_valid_lon`` are dropped as well.
    A summary of what was ignored is printed to the console.

    Args:
        filename: Path or buffer passed straight to ``pandas.read_csv``.

    Returns:
        A DataFrame containing only the rows with valid ``lat``/``lon`` values,
        with remaining NaN values in other columns replaced by ``None``.
    """
    df = pd.read_csv(filename)
    initial_rows = len(df)
    df = df.dropna(subset=["lat", "lon"])
    # replace for other fields not to break kepler parsing; ``replace`` returns
    # a new frame, so the result must be re-bound or the call has no effect.
    df = df.replace({np.nan: None})
    print(f"Ignored {initial_rows - len(df)} coordinates with NaN")

    lat_ok = df.lat.astype(str).apply(is_valid_lat)
    lon_ok = df.lon.astype(str).apply(is_valid_lon)
    # Force a boolean dtype so the mask is always treated as a row filter,
    # even when no rows remain after dropping NaN coordinates.
    valid_index = (lat_ok & lon_ok).astype(bool)

    if len(df_invalid := df[~valid_index]):
        print(f"Found {len(df_invalid)} invalid coordinate pairs, ignoring:")
        print(df_invalid[["lat", "lon"]].to_string())
    return df[valid_index]
def ensure_file_path(dirname, filename): def ensure_file_path(dirname, filename):