mirror of
https://github.com/bellingcat/geoclustering.git
synced 2026-06-12 21:48:30 +03:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
de4d4689b9 | ||
|
|
484d3cb02c | ||
|
|
65366816fa | ||
|
|
de91354867 | ||
|
|
e9a7519168 | ||
|
|
dc7e12642e | ||
|
|
93c51d7a80 | ||
|
|
f77d1d9d62 | ||
|
|
99e844c6ce | ||
|
|
ff094a1d3e |
8
.github/workflows/main.yml
vendored
8
.github/workflows/main.yml
vendored
@@ -36,17 +36,19 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
python setup.py check
|
python setup.py check
|
||||||
python setup.py bdist_wheel sdist
|
python setup.py bdist_wheel sdist
|
||||||
|
|
||||||
- python: "3.10"
|
- python: "3.10"
|
||||||
task:
|
task:
|
||||||
name: "Style"
|
name: "Lint"
|
||||||
run: |
|
run: |
|
||||||
black --check .
|
black --check .
|
||||||
- python: "3.10"
|
- python: "3.10"
|
||||||
task:
|
task:
|
||||||
name: "Test"
|
name: "Test"
|
||||||
run: pytest --exitfirst --failed-first
|
run: pytest --exitfirst --failed-first
|
||||||
|
- python: "3.7"
|
||||||
|
task:
|
||||||
|
name: "Test (3.7)"
|
||||||
|
run: pytest --exitfirst --failed-first
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
|||||||
1
Pipfile
1
Pipfile
@@ -16,6 +16,7 @@ black = "*"
|
|||||||
pre-commit = "*"
|
pre-commit = "*"
|
||||||
pytest = "*"
|
pytest = "*"
|
||||||
wheel = "*"
|
wheel = "*"
|
||||||
|
geoclustering = {editable = true, path = "."}
|
||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.9"
|
python_version = "3.9"
|
||||||
|
|||||||
1163
Pipfile.lock
generated
1163
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
21
README.md
21
README.md
@@ -72,7 +72,7 @@ id,name,lat,lon
|
|||||||
|
|
||||||
## Output
|
## Output
|
||||||
|
|
||||||
If at least one cluster was found, the tool outputs a folder with output as `json`, `geojson`, `txt` files. A kepler.gl `html` file is generated as well.
|
If at least one cluster was found, the tool outputs a folder with output as `json`, `geojson`, `txt`, `csv` files. A kepler.gl `html` file is generated as well.
|
||||||
|
|
||||||
### JSON
|
### JSON
|
||||||
|
|
||||||
@@ -132,6 +132,16 @@ id 9, name Rosanna Foggo, lat -6.2074293, lon 106.8915948
|
|||||||
// ...
|
// ...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### CSV
|
||||||
|
|
||||||
|
Encodes each point on its own line, with the `cluster_id` of the cluster it belongs to as the first column.
|
||||||
|
|
||||||
|
```csv
|
||||||
|
cluster_id,name,lat,lon
|
||||||
|
9,Rosanna Foggo,-6.2074293,106.8915948
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
### kepler.gl
|
### kepler.gl
|
||||||
|
|
||||||

|

|
||||||
@@ -142,10 +152,19 @@ It is assumed that you are using **Python3.9+**. It is encouraged to [setup a vi
|
|||||||
|
|
||||||
```sh
|
```sh
|
||||||
# install dependencies & dev-dependencies
|
# install dependencies & dev-dependencies
|
||||||
|
# PIP
|
||||||
pip install -e .[dev,full]
|
pip install -e .[dev,full]
|
||||||
|
# PIPENV
|
||||||
|
pipenv install --dev -e .
|
||||||
|
|
||||||
# install a git hook that runs the code formatter before each commit.
|
# install a git hook that runs the code formatter before each commit.
|
||||||
pre-commit install
|
pre-commit install
|
||||||
```
|
```
|
||||||
|
|
||||||
We use [Black](https://github.com/psf/black) as our code formatter. If you don't want to use the `pre-commit` hook, you can run the formatter manually or via an editor plugin.
|
We use [Black](https://github.com/psf/black) as our code formatter. If you don't want to use the `pre-commit` hook, you can run the formatter manually or via an editor plugin.
|
||||||
|
|
||||||
|
## Release
|
||||||
|
|
||||||
|
1. Update [version.py](geoclustering/version.py)
|
||||||
|
2. Run `scripts/release.sh`
|
||||||
|
3. Confirm GH action completed successfully
|
||||||
@@ -1,6 +1,5 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import click
|
import click
|
||||||
import os
|
|
||||||
import webbrowser
|
import webbrowser
|
||||||
|
|
||||||
import geoclustering.clustering as clustering
|
import geoclustering.clustering as clustering
|
||||||
@@ -44,12 +43,13 @@ import geoclustering.io as io
|
|||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--open",
|
"--open",
|
||||||
|
"_open",
|
||||||
is_flag=True,
|
is_flag=True,
|
||||||
help="Open the generated visualization in the default browser automatically.",
|
help="Open the generated visualization in the default browser automatically.",
|
||||||
)
|
)
|
||||||
@click.option("--debug", is_flag=True, help="Print debug output.")
|
@click.option("--debug", is_flag=True, help="Print debug output.")
|
||||||
@click.argument("filename", type=click.Path(exists=True))
|
@click.argument("filename", type=click.Path(exists=True))
|
||||||
def main(distance, size, output, filename, algorithm, open, debug):
|
def main(distance, size, output, filename, algorithm, _open, debug):
|
||||||
def print_debug(s):
|
def print_debug(s):
|
||||||
if debug:
|
if debug:
|
||||||
click.secho(s, fg="bright_black")
|
click.secho(s, fg="bright_black")
|
||||||
@@ -68,21 +68,21 @@ def main(distance, size, output, filename, algorithm, open, debug):
|
|||||||
print_debug(f"Found {len(clusters)} valid clusters using {algorithm}")
|
print_debug(f"Found {len(clusters)} valid clusters using {algorithm}")
|
||||||
|
|
||||||
encoded = encoding.encode_clusters(clusters)
|
encoded = encoding.encode_clusters(clusters)
|
||||||
|
|
||||||
io.write_output_file(output, "result.txt", encoded["string"])
|
io.write_output_file(output, "result.txt", encoded["string"])
|
||||||
io.write_output_file(output, "result.json", encoded["json"])
|
io.write_output_file(output, "result.json", encoded["json"])
|
||||||
io.write_output_file(output, "result.geojson", encoded["geojson"])
|
io.write_output_file(output, "result.geojson", encoded["geojson"])
|
||||||
|
io.write_output_file(output, "result.csv", encoded["csv"])
|
||||||
|
|
||||||
vis = io.write_visualization(output, "result.html", encoded["geojson"])
|
vis = io.write_visualization(output, "result.html", encoded["geojson"])
|
||||||
if vis is None:
|
if vis is None:
|
||||||
print_debug(f"Skipped generating visualization: kepler is not installed.")
|
print_debug("Skipped generating visualization: kepler is not installed.")
|
||||||
|
|
||||||
click.echo(f"Output files saved to {Path(output).absolute()}")
|
click.echo(f"Output files saved to {Path(output).absolute()}")
|
||||||
|
|
||||||
if open:
|
if _open:
|
||||||
if vis:
|
if vis:
|
||||||
webbrowser.open_new_tab("file://" + str(vis.absolute()))
|
webbrowser.open_new_tab("file://" + str(vis.absolute()))
|
||||||
print_debug(f"Opened visualization in default browser.")
|
print_debug("Opened visualization in default browser.")
|
||||||
else:
|
else:
|
||||||
click.secho(
|
click.secho(
|
||||||
"Can't open kepler.gl: package not installed. Please re-install geoclustering with `pip install geoclustering[full]`.",
|
"Can't open kepler.gl: package not installed. Please re-install geoclustering with `pip install geoclustering[full]`.",
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
import json
|
import json
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import geojson
|
import geojson
|
||||||
|
import csv
|
||||||
|
import io # not io.py
|
||||||
|
|
||||||
|
|
||||||
class NpEncoder(json.JSONEncoder):
|
class NpEncoder(json.JSONEncoder):
|
||||||
@@ -47,7 +49,7 @@ class JSONEncoder:
|
|||||||
|
|
||||||
for record in cluster:
|
for record in cluster:
|
||||||
cluster_data["points"].append(record)
|
cluster_data["points"].append(record)
|
||||||
self.state.append(cluster_data)
|
self.state.append(cluster_data)
|
||||||
|
|
||||||
def get(self):
|
def get(self):
|
||||||
return json.dumps(self.state, cls=NpEncoder)
|
return json.dumps(self.state, cls=NpEncoder)
|
||||||
@@ -74,13 +76,37 @@ class GeoJSONEncoder:
|
|||||||
return json.dumps(geojson.FeatureCollection(self.state), cls=NpEncoder)
|
return json.dumps(geojson.FeatureCollection(self.state), cls=NpEncoder)
|
||||||
|
|
||||||
|
|
||||||
|
class CSVEncoder:
    """Encodes clustering result as a CSV.

    Each record of each visited cluster becomes one row. The first column is
    ``cluster_id``; the remaining columns are the keys of the first record
    seen, in that record's key order. Non-numeric values are quoted and rows
    are terminated with ``"\\n"``.
    """

    def __init__(self):
        # In-memory text buffer that the csv writer appends to.
        self.state = io.StringIO()
        # Created lazily by visitor(): the column names are only known once
        # the first record has been seen.
        self.writer = None

    def visitor(self, cluster_id, cluster):
        """Append every record of ``cluster`` as a row tagged with ``cluster_id``.

        Args:
            cluster_id: Identifier written into the ``cluster_id`` column.
            cluster: Indexable, sized collection of dict-like records.

        Raises:
            ValueError: Raised by ``csv.DictWriter`` if a record contains a
                key that is not among the header columns.
        """
        # An empty cluster has no record to take column names from and no
        # rows to write; skip it instead of failing on ``cluster[0]``.
        if len(cluster) == 0:
            return

        if self.writer is None:
            self.writer = csv.DictWriter(
                self.state,
                fieldnames=["cluster_id"] + list(cluster[0].keys()),
                quoting=csv.QUOTE_NONNUMERIC,
                lineterminator="\n",
            )
            self.writer.writeheader()

        for record in cluster:
            # "cluster_id" is placed last so it overrides a same-named key
            # that may already be present in the record.
            self.writer.writerow({**record, "cluster_id": cluster_id})

    def get(self):
        """Return everything written so far as a single CSV string."""
        return self.state.getvalue()
|
||||||
|
|
||||||
|
|
||||||
def encode_clusters(clusters):
|
def encode_clusters(clusters):
|
||||||
json_encoder = JSONEncoder()
|
json_encoder = JSONEncoder()
|
||||||
geojson_encoder = GeoJSONEncoder()
|
geojson_encoder = GeoJSONEncoder()
|
||||||
string_encoder = StringEncoder()
|
string_encoder = StringEncoder()
|
||||||
|
csv_encoder = CSVEncoder()
|
||||||
|
|
||||||
encoders = [json_encoder, geojson_encoder, string_encoder]
|
encoders = [json_encoder, geojson_encoder, string_encoder, csv_encoder]
|
||||||
|
|
||||||
for cluster_id, cluster in clusters.items():
|
for cluster_id, cluster in clusters.items():
|
||||||
for encoder in encoders:
|
for encoder in encoders:
|
||||||
encoder.visitor(cluster_id, cluster)
|
encoder.visitor(cluster_id, cluster)
|
||||||
@@ -89,4 +115,5 @@ def encode_clusters(clusters):
|
|||||||
"json": json_encoder.get(),
|
"json": json_encoder.get(),
|
||||||
"geojson": geojson_encoder.get(),
|
"geojson": geojson_encoder.get(),
|
||||||
"string": string_encoder.get(),
|
"string": string_encoder.get(),
|
||||||
|
"csv": csv_encoder.get(),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,13 +58,16 @@ def read_csv_file(filename):
|
|||||||
valid_index = df.lat.apply(is_valid_lat) & df.lon.apply(is_valid_lon)
|
valid_index = df.lat.apply(is_valid_lat) & df.lon.apply(is_valid_lon)
|
||||||
df_invalid = df[~valid_index]
|
df_invalid = df[~valid_index]
|
||||||
|
|
||||||
if count_invalid := len(df_invalid):
|
count_invalid = len(df_invalid)
|
||||||
|
if count_invalid:
|
||||||
df_not_empty = df_invalid[
|
df_not_empty = df_invalid[
|
||||||
(df_invalid.lat.apply(is_not_none) | df_invalid.lon.apply(is_not_none))
|
(df_invalid.lat.apply(is_not_none) | df_invalid.lon.apply(is_not_none))
|
||||||
]
|
]
|
||||||
count_not_empty = len(df_not_empty)
|
|
||||||
|
|
||||||
if count_empty := count_invalid - count_not_empty:
|
count_not_empty = len(df_not_empty)
|
||||||
|
count_empty = count_invalid - count_not_empty
|
||||||
|
|
||||||
|
if count_empty:
|
||||||
print(f"Removed {count_empty} empty coordinate pairs.")
|
print(f"Removed {count_empty} empty coordinate pairs.")
|
||||||
|
|
||||||
if count_not_empty:
|
if count_not_empty:
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
_MAJOR = "0"
|
_MAJOR = "0"
|
||||||
_MINOR = "3"
|
_MINOR = "4"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "0"
|
_PATCH = "1"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from geoclustering.clustering import cluster_locations
|
from geoclustering.clustering import cluster_locations
|
||||||
from geoclustering.io import read_csv_file
|
from tests.helpers import read_fixture_csv
|
||||||
from tests.helpers import get_fixture_path, read_fixture_csv
|
|
||||||
|
|
||||||
|
|
||||||
df = read_fixture_csv("clustering.csv")
|
df = read_fixture_csv("clustering.csv")
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
from geoclustering.clustering import cluster_locations
|
|
||||||
from geoclustering.encoding import encode_clusters
|
from geoclustering.encoding import encode_clusters
|
||||||
from tests.helpers import read_fixture_csv, read_fixture_content
|
from tests.helpers import read_fixture_csv, read_fixture_content
|
||||||
|
|
||||||
@@ -28,3 +27,4 @@ def test_encoders():
|
|||||||
assert res["string"] == read_fixture_content("snapshots/result.txt")
|
assert res["string"] == read_fixture_content("snapshots/result.txt")
|
||||||
assert res["json"] == read_fixture_content("snapshots/result.json")
|
assert res["json"] == read_fixture_content("snapshots/result.json")
|
||||||
assert res["geojson"] == read_fixture_content("snapshots/result.geojson")
|
assert res["geojson"] == read_fixture_content("snapshots/result.geojson")
|
||||||
|
assert res["csv"] == read_fixture_content("snapshots/result.csv")
|
||||||
|
|||||||
5
tests/fixtures/snapshots/result.csv
vendored
Normal file
5
tests/fixtures/snapshots/result.csv
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"cluster_id","id","name","lat","lon"
|
||||||
|
0,1,"Alice",52.523955,13.442362
|
||||||
|
0,2,"Bob",52.526659,13.448097
|
||||||
|
1,3,"Carol",52.525626,13.419246
|
||||||
|
1,4,"Dan",52.52443559865125,13.41261723049818
|
||||||
|
2
tests/fixtures/snapshots/result.json
vendored
2
tests/fixtures/snapshots/result.json
vendored
@@ -1 +1 @@
|
|||||||
[{"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}]
|
[{"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}]
|
||||||
Reference in New Issue
Block a user