mirror of
https://github.com/bellingcat/geoclustering.git
synced 2026-06-12 05:28:29 +03:00
Compare commits
27 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e9b7680263 | ||
|
|
de4d4689b9 | ||
|
|
484d3cb02c | ||
|
|
65366816fa | ||
|
|
de91354867 | ||
|
|
e9a7519168 | ||
|
|
dc7e12642e | ||
|
|
93c51d7a80 | ||
|
|
f77d1d9d62 | ||
|
|
99e844c6ce | ||
|
|
ff094a1d3e | ||
|
|
926aaf73d6 | ||
|
|
6a5cb3c3c3 | ||
|
|
d252c6b8f3 | ||
|
|
1c5d0f649e | ||
|
|
6ed01417c3 | ||
|
|
3cc3c30e03 | ||
|
|
c9d36c6bf3 | ||
|
|
62da0806c7 | ||
|
|
8657bd73ec | ||
|
|
e633665813 | ||
|
|
cff5256d06 | ||
|
|
4dfa08bbbc | ||
|
|
eaa4022b70 | ||
|
|
1cb5541baa | ||
|
|
b40074317c | ||
|
|
f1053953ba |
53
.github/actions/setup-venv/action.yml
vendored
Normal file
53
.github/actions/setup-venv/action.yml
vendored
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
name: Python virtualenv
|
||||||
|
description: Set up a Python virtual environment with caching
|
||||||
|
inputs:
|
||||||
|
python-version:
|
||||||
|
description: The Python version to use
|
||||||
|
required: true
|
||||||
|
cache-prefix:
|
||||||
|
description: Update this to invalidate the cache
|
||||||
|
required: true
|
||||||
|
default: v0
|
||||||
|
runs:
|
||||||
|
using: composite
|
||||||
|
steps:
|
||||||
|
- name: Setup Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: ${{ inputs.python-version }}
|
||||||
|
|
||||||
|
- shell: bash
|
||||||
|
run: |
|
||||||
|
# Install prerequisites.
|
||||||
|
pip install --upgrade pip setuptools wheel virtualenv
|
||||||
|
|
||||||
|
- shell: bash
|
||||||
|
run: |
|
||||||
|
# Get the exact Python version to use in the cache key.
|
||||||
|
echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- uses: actions/cache@v2
|
||||||
|
id: virtualenv-cache
|
||||||
|
with:
|
||||||
|
path: .venv
|
||||||
|
key: ${{ inputs.cache-prefix }}-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('Pipfile.lock') }}
|
||||||
|
- if: steps.virtualenv-cache.outputs.cache-hit != 'true'
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
# Set up virtual environment without cache hit.
|
||||||
|
test -d .venv || virtualenv -p $(which python) --copies --reset-app-data .venv
|
||||||
|
. .venv/bin/activate
|
||||||
|
pip install -e .[dev]
|
||||||
|
|
||||||
|
- if: steps.virtualenv-cache.outputs.cache-hit == 'true'
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
# Set up virtual environment from cache hit.
|
||||||
|
. .venv/bin/activate
|
||||||
|
pip install --no-deps -e .[dev]
|
||||||
|
|
||||||
|
- shell: bash
|
||||||
|
run: |
|
||||||
|
# Show environment info.
|
||||||
|
. .venv/bin/activate
|
||||||
|
echo "✓ Installed $(python --version) virtual environment to $(which python)"
|
||||||
10
.github/workflows/lint.yml
vendored
10
.github/workflows/lint.yml
vendored
@@ -1,10 +0,0 @@
|
|||||||
name: Lint
|
|
||||||
|
|
||||||
on: [push]
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
black:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- uses: psf/black@stable
|
|
||||||
119
.github/workflows/main.yml
vendored
Normal file
119
.github/workflows/main.yml
vendored
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
name: Main
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
# on: [push]
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
tags:
|
||||||
|
- "v*.*.*"
|
||||||
|
|
||||||
|
env:
|
||||||
|
# Change this to invalidate existing cache.
|
||||||
|
CACHE_PREFIX: v0
|
||||||
|
PYTHONPATH: ./
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
checks:
|
||||||
|
name: Python ${{ matrix.python }} - ${{ matrix.task.name }}
|
||||||
|
runs-on: [ubuntu-latest]
|
||||||
|
timeout-minutes: 15
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- python: "3.10"
|
||||||
|
task:
|
||||||
|
name: "Build"
|
||||||
|
run: |
|
||||||
|
python setup.py check
|
||||||
|
python setup.py bdist_wheel sdist
|
||||||
|
- python: "3.10"
|
||||||
|
task:
|
||||||
|
name: "Lint"
|
||||||
|
run: |
|
||||||
|
black --check .
|
||||||
|
- python: "3.10"
|
||||||
|
task:
|
||||||
|
name: "Test"
|
||||||
|
run: pytest --exitfirst --failed-first --assert=plain
|
||||||
|
- python: "3.8"
|
||||||
|
task:
|
||||||
|
name: "Test (3.8)"
|
||||||
|
run: pytest --exitfirst --failed-first --assert=plain
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Setup Python environment
|
||||||
|
uses: ./.github/actions/setup-venv
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python }}
|
||||||
|
cache-prefix: ${{ env.CACHE_PREFIX }}
|
||||||
|
|
||||||
|
- name: ${{ matrix.task.name }}
|
||||||
|
run: |
|
||||||
|
. .venv/bin/activate
|
||||||
|
${{ matrix.task.run }}
|
||||||
|
|
||||||
|
- name: Upload package distribution files
|
||||||
|
if: matrix.task.name == 'Build'
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: package
|
||||||
|
path: dist
|
||||||
|
|
||||||
|
- name: Clean up
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
. .venv/bin/activate
|
||||||
|
pip uninstall -y geoclustering
|
||||||
|
|
||||||
|
release:
|
||||||
|
name: Release
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: [checks]
|
||||||
|
if: startsWith(github.ref, 'refs/tags/')
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v1
|
||||||
|
|
||||||
|
- name: Setup Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: "3.10"
|
||||||
|
|
||||||
|
- name: Install requirements
|
||||||
|
run: |
|
||||||
|
pip install --upgrade pip setuptools wheel "twine>=1.11.0"
|
||||||
|
|
||||||
|
- name: Prepare environment
|
||||||
|
run: |
|
||||||
|
echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV
|
||||||
|
echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Download package distribution files
|
||||||
|
uses: actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
name: package
|
||||||
|
path: dist
|
||||||
|
|
||||||
|
- name: Publish package to PyPI
|
||||||
|
run: |
|
||||||
|
twine upload -u '${{ secrets.PYPI_USERNAME }}' -p '${{ secrets.PYPI_PASSWORD }}' dist/*
|
||||||
|
|
||||||
|
- name: Publish GitHub release
|
||||||
|
uses: softprops/action-gh-release@v1
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
with:
|
||||||
|
# body_path: ${{ github.workspace }}-RELEASE_NOTES.md
|
||||||
|
prerelease: ${{ contains(env.TAG, 'rc') }}
|
||||||
|
files: |
|
||||||
|
dist/*
|
||||||
10
.pre-commit-config.yaml
Normal file
10
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
repos:
|
||||||
|
- repo: https://github.com/psf/black
|
||||||
|
rev: 22.3.0
|
||||||
|
hooks:
|
||||||
|
- id: black
|
||||||
|
# It is recommended to specify the latest version of Python
|
||||||
|
# supported by your project here, or alternatively use
|
||||||
|
# pre-commit's default_language_version, see
|
||||||
|
# https://pre-commit.com/#top_level-default_language_version
|
||||||
|
language_version: python3.9
|
||||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
|||||||
MIT License
|
MIT License
|
||||||
|
|
||||||
Copyright (c) 2022, Felix Spöttel
|
Copyright (c) 2022, Stichting Bellingcat
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|||||||
22
Pipfile
Normal file
22
Pipfile
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
[[source]]
|
||||||
|
url = "https://pypi.org/simple"
|
||||||
|
verify_ssl = true
|
||||||
|
name = "pypi"
|
||||||
|
|
||||||
|
[packages]
|
||||||
|
click = "*"
|
||||||
|
geojson = "*"
|
||||||
|
keplergl = "*"
|
||||||
|
numpy = "*"
|
||||||
|
pandas = "*"
|
||||||
|
scikit-learn = "*"
|
||||||
|
|
||||||
|
[dev-packages]
|
||||||
|
black = "*"
|
||||||
|
pre-commit = "*"
|
||||||
|
pytest = "*"
|
||||||
|
wheel = "*"
|
||||||
|
geoclustering = {editable = true, path = "."}
|
||||||
|
|
||||||
|
[requires]
|
||||||
|
python_version = "3.9"
|
||||||
2344
Pipfile.lock
generated
Normal file
2344
Pipfile.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
70
README.md
70
README.md
@@ -1,4 +1,4 @@
|
|||||||
# geocluster
|
# geoclustering
|
||||||
|
|
||||||
> 📍 command-line tool for clustering geolocations.
|
> 📍 command-line tool for clustering geolocations.
|
||||||
|
|
||||||
@@ -10,34 +10,38 @@
|
|||||||
|
|
||||||
### Clustering Method
|
### Clustering Method
|
||||||
|
|
||||||
A cluster is created when a certain number of points (=> `--size`) each are within a given distance (=> `--distance`) of at least one other point in the cluster.
|
A cluster is created when a certain number of points (defined with `--size`) each are within a given distance (defined with `--distance`) of at least one other point in the cluster.
|
||||||
|
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
Clone the repository:
|
Install with pip:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
git clone https://github.com/fspoettel/geocluster
|
# with kepler.gl visualization support
|
||||||
cd geocluster
|
pip install geoclustering[full]
|
||||||
|
|
||||||
|
# only text-based output
|
||||||
|
pip install geoclustering
|
||||||
```
|
```
|
||||||
|
|
||||||
Install keplergl build dependencies:
|
If the `full` install fails, you might need to install kepler.gl build dependencies:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# macos
|
# macos
|
||||||
brew install proj gdal
|
brew install proj gdal
|
||||||
```
|
```
|
||||||
|
|
||||||
Install project with pip:
|
|
||||||
```sh
|
|
||||||
pip install .
|
|
||||||
```
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
```
|
```
|
||||||
Usage: geocluster [OPTIONS] FILENAME
|
Usage: geoclustering [OPTIONS] FILENAME
|
||||||
|
|
||||||
|
Tool to cluster geolocations. A cluster is created when a certain number of
|
||||||
|
points (defined with --size) each are within a given distance (defined with
|
||||||
|
--distance) of at least one other point in the cluster. Input is supplied as
|
||||||
|
a csv file. At a minimum, each row needs to have a 'lat' and a 'lon' column.
|
||||||
|
Other rows are reflected to the output.
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
-d, --distance FLOAT (in km) Max. distance between two points in
|
-d, --distance FLOAT (in km) Max. distance between two points in
|
||||||
@@ -50,12 +54,15 @@ Options:
|
|||||||
Clustering algorithm to be used. `optics`
|
Clustering algorithm to be used. `optics`
|
||||||
produces tighter clusters but is slower.
|
produces tighter clusters but is slower.
|
||||||
Default: dbscan
|
Default: dbscan
|
||||||
|
--open Open the generated visualization in the
|
||||||
|
default browser automatically.
|
||||||
|
--debug Print debug output.
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
```
|
```
|
||||||
|
|
||||||
## Input
|
## Input
|
||||||
|
|
||||||
Inputs are supplied as a `.csv` file. The only required fields are `lat` and `lon`, all other fields are reflected to the output.
|
Inputs are supplied as a `.csv` file. At a minimum, each row needs to have a `lat` and a `lon` column. Other columns are reflected to the output.
|
||||||
|
|
||||||
```csv
|
```csv
|
||||||
id,name,lat,lon
|
id,name,lat,lon
|
||||||
@@ -65,7 +72,7 @@ id,name,lat,lon
|
|||||||
|
|
||||||
## Output
|
## Output
|
||||||
|
|
||||||
If at least one cluster was found, the tool outputs a folder with `json`, `geojson`, `text` and a kepler.gl `html` files.
|
If at least one cluster was found, the tool writes an output folder containing `json`, `geojson`, `txt` and `csv` files. A kepler.gl `html` file is generated as well.
|
||||||
|
|
||||||
### JSON
|
### JSON
|
||||||
|
|
||||||
@@ -114,7 +121,7 @@ Encodes a single `FeatureCollection`, containing all points as `Feature` objects
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### txt
|
### Text
|
||||||
|
|
||||||
Encodes cluster as blocks separated by a newline, where each line in a cluster block contains one point.
|
Encodes cluster as blocks separated by a newline, where each line in a cluster block contains one point.
|
||||||
|
|
||||||
@@ -125,6 +132,39 @@ id 9, name Rosanna Foggo, lat -6.2074293, lon 106.8915948
|
|||||||
// ...
|
// ...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### CSV
|
||||||
|
|
||||||
|
Encodes each event in one line with `cluster_id` information associated.
|
||||||
|
|
||||||
|
```csv
|
||||||
|
cluster_id,name,lat,lon
|
||||||
|
9,Rosanna Foggo,-6.2074293,106.8915948
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
### kepler.gl
|
### kepler.gl
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
## Develop
|
||||||
|
|
||||||
|
It is assumed that you are using **Python 3.9+**. You are encouraged to [set up a virtualenv](https://wiki.archlinux.org/title/Python/Virtual_environment#venv) for development.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# install dependencies & dev-dependencies
|
||||||
|
# PIP
|
||||||
|
pip install -e .[dev,full]
|
||||||
|
# PIPENV
|
||||||
|
pipenv install --dev -e .
|
||||||
|
|
||||||
|
# install a git hook that runs the code formatter before each commit.
|
||||||
|
pre-commit install
|
||||||
|
```
|
||||||
|
|
||||||
|
We use [Black](https://github.com/psf/black) as our code formatter. If you don't want to use the `pre-commit` hook, you can run the formatter manually or via an editor plugin.
|
||||||
|
|
||||||
|
## Release
|
||||||
|
|
||||||
|
1. Update [version.py](geoclustering/version.py)
|
||||||
|
2. Run `scripts/release.sh`
|
||||||
|
3. Confirm GH action completed successfully
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
import click
|
|
||||||
import webbrowser
|
|
||||||
|
|
||||||
import geocluster.clustering as clustering
|
|
||||||
import geocluster.encoding as encoding
|
|
||||||
import geocluster.io as io
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
|
||||||
@click.option(
|
|
||||||
"--distance",
|
|
||||||
"-d",
|
|
||||||
type=click.FLOAT,
|
|
||||||
required=True,
|
|
||||||
help="(in km) Max. distance between two points in a cluster.",
|
|
||||||
)
|
|
||||||
@click.option(
|
|
||||||
"--size",
|
|
||||||
"-s",
|
|
||||||
type=click.INT,
|
|
||||||
required=True,
|
|
||||||
help="Min. number of points in a cluster.",
|
|
||||||
)
|
|
||||||
@click.option(
|
|
||||||
"--output",
|
|
||||||
"-o",
|
|
||||||
type=click.Path(exists=False),
|
|
||||||
default="output",
|
|
||||||
help="Output directory for results. Default: ./output",
|
|
||||||
)
|
|
||||||
@click.option(
|
|
||||||
"--algorithm",
|
|
||||||
"-a",
|
|
||||||
type=click.Choice(
|
|
||||||
["dbscan", "optics"],
|
|
||||||
case_sensitive=False,
|
|
||||||
),
|
|
||||||
default="dbscan",
|
|
||||||
help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan",
|
|
||||||
)
|
|
||||||
@click.argument("filename", type=click.Path(exists=True))
|
|
||||||
def main(distance, size, output, filename, algorithm):
|
|
||||||
df = io.read_csv_file(filename)
|
|
||||||
|
|
||||||
clusters = clustering.cluster_locations(
|
|
||||||
df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size
|
|
||||||
)
|
|
||||||
|
|
||||||
if not bool(clusters):
|
|
||||||
click.echo("Did not find clusters matching input parameters.")
|
|
||||||
return
|
|
||||||
|
|
||||||
encoded = encoding.encode_clusters(clusters)
|
|
||||||
|
|
||||||
io.write_output_file(output, "result.txt", encoded["string"])
|
|
||||||
io.write_output_file(output, "result.json", encoded["json"])
|
|
||||||
io.write_output_file(output, "result.geojson", encoded["geojson"])
|
|
||||||
vis = io.write_visualization(output, "result.html", encoded["geojson"])
|
|
||||||
|
|
||||||
webbrowser.open_new_tab("file://" + str(vis.absolute()))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,78 +0,0 @@
|
|||||||
from keplergl import KeplerGl
|
|
||||||
from pathlib import Path
|
|
||||||
from pkg_resources import resource_filename
|
|
||||||
import json
|
|
||||||
import json
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def is_valid_lat(val: str) -> bool:
|
|
||||||
"""Given a string, check if it corresponds to a valid decimal latitude value"""
|
|
||||||
try:
|
|
||||||
val = float(val)
|
|
||||||
return val >= -90 and val <= 90
|
|
||||||
except:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def is_valid_lon(val: str) -> bool:
|
|
||||||
"""Given a string, check if it corresponds to a valid decimal longitude value"""
|
|
||||||
try:
|
|
||||||
val = float(val)
|
|
||||||
return val >= -180 and val <= 180
|
|
||||||
except:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def read_csv_file(filename):
|
|
||||||
"""Read input csv file, dropping rows that don't have valid location data."""
|
|
||||||
df = pd.read_csv(filename)
|
|
||||||
initial_rows = len(df)
|
|
||||||
|
|
||||||
df = df.dropna(subset=["lat", "lon"])
|
|
||||||
df = df.replace(
|
|
||||||
{np.nan: None}
|
|
||||||
) # replace for other fields not to break kepler parsing
|
|
||||||
print(f"Ignored {initial_rows - len(df)} coordinates with NaN")
|
|
||||||
|
|
||||||
valid_index = df.lat.astype(str).apply(is_valid_lat) & df.lon.astype(str).apply(
|
|
||||||
is_valid_lon
|
|
||||||
)
|
|
||||||
if len(df_invalid := df[~valid_index]):
|
|
||||||
print(f"Found {len(df_invalid)} invalid coordinate pairs, ignoring:")
|
|
||||||
print(df_invalid[["lat", "lon"]].to_string())
|
|
||||||
return df[valid_index]
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_file_path(dirname, filename):
|
|
||||||
"""Ensure a parent directory exists for a file."""
|
|
||||||
path = Path(dirname)
|
|
||||||
path.mkdir(parents=True, exist_ok=True)
|
|
||||||
return path / filename
|
|
||||||
|
|
||||||
|
|
||||||
def write_output_file(dirname, filename, data):
|
|
||||||
"""Write a file, ensuring parent directories."""
|
|
||||||
filepath = ensure_file_path(dirname, filename)
|
|
||||||
|
|
||||||
with open(filepath, "w") as f:
|
|
||||||
f.write(data)
|
|
||||||
|
|
||||||
return filepath
|
|
||||||
|
|
||||||
|
|
||||||
def write_visualization(dirname, filename, data):
|
|
||||||
"""Write a visualization, ensuring parent directories."""
|
|
||||||
map = KeplerGl()
|
|
||||||
map.add_data(data=data, name="clusters")
|
|
||||||
|
|
||||||
# config configures a default color scheme for our clusters layer.
|
|
||||||
config_file = resource_filename("geocluster", "kepler_config.json")
|
|
||||||
with open(config_file) as f:
|
|
||||||
map.config = json.loads(f.read())
|
|
||||||
|
|
||||||
filepath = ensure_file_path(dirname, filename)
|
|
||||||
map.save_to_html(file_name=str(filepath), center_map=True)
|
|
||||||
|
|
||||||
return filepath
|
|
||||||
96
geoclustering/__main__.py
Normal file
96
geoclustering/__main__.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import click
|
||||||
|
import webbrowser
|
||||||
|
|
||||||
|
import geoclustering.clustering as clustering
|
||||||
|
import geoclustering.encoding as encoding
|
||||||
|
import geoclustering.io as io
|
||||||
|
|
||||||
|
|
||||||
|
@click.command(
|
||||||
|
help="Tool to cluster geolocations. A cluster is created when a certain number of points (defined with --size) each are within a given distance (defined with --distance) of at least one other point in the cluster. Input is supplied as a csv file. At a minimum, each row needs to have a 'lat' and a 'lon' column. Other rows are reflected to the output."
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--distance",
|
||||||
|
"-d",
|
||||||
|
type=click.FLOAT,
|
||||||
|
required=True,
|
||||||
|
help="(in km) Max. distance between two points in a cluster.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--size",
|
||||||
|
"-s",
|
||||||
|
type=click.INT,
|
||||||
|
required=True,
|
||||||
|
help="Min. number of points in a cluster.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--output",
|
||||||
|
"-o",
|
||||||
|
type=click.Path(exists=False),
|
||||||
|
default="output",
|
||||||
|
help="Output directory for results. Default: ./output",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--algorithm",
|
||||||
|
"-a",
|
||||||
|
type=click.Choice(
|
||||||
|
["dbscan", "optics"],
|
||||||
|
case_sensitive=False,
|
||||||
|
),
|
||||||
|
default="dbscan",
|
||||||
|
help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--open",
|
||||||
|
"_open",
|
||||||
|
is_flag=True,
|
||||||
|
help="Open the generated visualization in the default browser automatically.",
|
||||||
|
)
|
||||||
|
@click.option("--debug", is_flag=True, help="Print debug output.")
|
||||||
|
@click.argument("filename", type=click.Path(exists=True))
|
||||||
|
def main(distance, size, output, filename, algorithm, _open, debug):
|
||||||
|
def print_debug(s):
|
||||||
|
if debug:
|
||||||
|
click.secho(s, fg="bright_black")
|
||||||
|
|
||||||
|
df = io.read_csv_file(filename)
|
||||||
|
print_debug(f"Read {len(df)} valid coordinates from {Path(filename).absolute()}")
|
||||||
|
|
||||||
|
clusters = clustering.cluster_locations(
|
||||||
|
df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size
|
||||||
|
)
|
||||||
|
|
||||||
|
if not bool(clusters):
|
||||||
|
click.secho("Did not find clusters matching input parameters.", fg="yellow")
|
||||||
|
return
|
||||||
|
|
||||||
|
print_debug(f"Found {len(clusters)} valid clusters using {algorithm}")
|
||||||
|
|
||||||
|
encoded = encoding.encode_clusters(clusters)
|
||||||
|
io.write_output_file(output, "result.txt", encoded["string"])
|
||||||
|
io.write_output_file(output, "result.json", encoded["json"])
|
||||||
|
io.write_output_file(output, "result.geojson", encoded["geojson"])
|
||||||
|
io.write_output_file(output, "result.csv", encoded["csv"])
|
||||||
|
|
||||||
|
vis = io.write_visualization(output, "result.html", encoded["geojson"])
|
||||||
|
if vis is None:
|
||||||
|
print_debug("Skipped generating visualization: kepler is not installed.")
|
||||||
|
|
||||||
|
click.echo(f"Output files saved to {Path(output).absolute()}")
|
||||||
|
|
||||||
|
if _open:
|
||||||
|
if vis:
|
||||||
|
webbrowser.open_new_tab("file://" + str(vis.absolute()))
|
||||||
|
print_debug("Opened visualization in default browser.")
|
||||||
|
else:
|
||||||
|
click.secho(
|
||||||
|
"Can't open kepler.gl: package not installed. Please re-install geoclustering with `pip install geoclustering[full]`.",
|
||||||
|
fg="yellow",
|
||||||
|
)
|
||||||
|
|
||||||
|
click.secho("Clustering completed.", fg="green")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -14,8 +14,6 @@ def to_cluster_dict(df, clustering):
|
|||||||
"""
|
"""
|
||||||
clusters_by_id = {}
|
clusters_by_id = {}
|
||||||
|
|
||||||
print(clustering.labels_)
|
|
||||||
|
|
||||||
for idx, cluster_id in enumerate(clustering.labels_):
|
for idx, cluster_id in enumerate(clustering.labels_):
|
||||||
# ignore "noise" locations that don't belong to any cluster.
|
# ignore "noise" locations that don't belong to any cluster.
|
||||||
if cluster_id > -1:
|
if cluster_id > -1:
|
||||||
@@ -1,6 +1,8 @@
|
|||||||
import json
|
import json
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import geojson
|
import geojson
|
||||||
|
import csv
|
||||||
|
import io # not io.py
|
||||||
|
|
||||||
|
|
||||||
class NpEncoder(json.JSONEncoder):
|
class NpEncoder(json.JSONEncoder):
|
||||||
@@ -47,7 +49,7 @@ class JSONEncoder:
|
|||||||
|
|
||||||
for record in cluster:
|
for record in cluster:
|
||||||
cluster_data["points"].append(record)
|
cluster_data["points"].append(record)
|
||||||
self.state.append(cluster_data)
|
self.state.append(cluster_data)
|
||||||
|
|
||||||
def get(self):
|
def get(self):
|
||||||
return json.dumps(self.state, cls=NpEncoder)
|
return json.dumps(self.state, cls=NpEncoder)
|
||||||
@@ -74,13 +76,37 @@ class GeoJSONEncoder:
|
|||||||
return json.dumps(geojson.FeatureCollection(self.state), cls=NpEncoder)
|
return json.dumps(geojson.FeatureCollection(self.state), cls=NpEncoder)
|
||||||
|
|
||||||
|
|
||||||
|
class CSVEncoder:
|
||||||
|
"""Encodes clustering result as a CSV"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.state = io.StringIO()
|
||||||
|
self.writer = False
|
||||||
|
|
||||||
|
def visitor(self, cluster_id, cluster):
|
||||||
|
if not self.writer:
|
||||||
|
self.writer = csv.DictWriter(
|
||||||
|
self.state,
|
||||||
|
fieldnames=["cluster_id"] + list(cluster[0].keys()),
|
||||||
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
|
lineterminator="\n",
|
||||||
|
)
|
||||||
|
self.writer.writeheader()
|
||||||
|
|
||||||
|
for record in cluster:
|
||||||
|
self.writer.writerow({**record, "cluster_id": cluster_id})
|
||||||
|
|
||||||
|
def get(self):
|
||||||
|
return self.state.getvalue()
|
||||||
|
|
||||||
|
|
||||||
def encode_clusters(clusters):
|
def encode_clusters(clusters):
|
||||||
json_encoder = JSONEncoder()
|
json_encoder = JSONEncoder()
|
||||||
geojson_encoder = GeoJSONEncoder()
|
geojson_encoder = GeoJSONEncoder()
|
||||||
string_encoder = StringEncoder()
|
string_encoder = StringEncoder()
|
||||||
|
csv_encoder = CSVEncoder()
|
||||||
|
|
||||||
encoders = [json_encoder, geojson_encoder, string_encoder]
|
encoders = [json_encoder, geojson_encoder, string_encoder, csv_encoder]
|
||||||
|
|
||||||
for cluster_id, cluster in clusters.items():
|
for cluster_id, cluster in clusters.items():
|
||||||
for encoder in encoders:
|
for encoder in encoders:
|
||||||
encoder.visitor(cluster_id, cluster)
|
encoder.visitor(cluster_id, cluster)
|
||||||
@@ -89,4 +115,5 @@ def encode_clusters(clusters):
|
|||||||
"json": json_encoder.get(),
|
"json": json_encoder.get(),
|
||||||
"geojson": geojson_encoder.get(),
|
"geojson": geojson_encoder.get(),
|
||||||
"string": string_encoder.get(),
|
"string": string_encoder.get(),
|
||||||
|
"csv": csv_encoder.get(),
|
||||||
}
|
}
|
||||||
120
geoclustering/io.py
Normal file
120
geoclustering/io.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from pkg_resources import resource_filename
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# kepler is optional, check if installed.
|
||||||
|
try:
|
||||||
|
from keplergl import KeplerGl
|
||||||
|
except:
|
||||||
|
has_kepler = False
|
||||||
|
else:
|
||||||
|
has_kepler = True
|
||||||
|
|
||||||
|
|
||||||
|
class HiddenPrints:
|
||||||
|
"""Disables stdout prints for a block of code."""
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
self._original_stdout = sys.stdout
|
||||||
|
sys.stdout = open(os.devnull, "w")
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||||
|
sys.stdout.close()
|
||||||
|
sys.stdout = self._original_stdout
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_lat(val: str) -> bool:
|
||||||
|
"""Given a string, check if it corresponds to a valid decimal latitude value"""
|
||||||
|
try:
|
||||||
|
val = float(val)
|
||||||
|
return val >= -90 and val <= 90
|
||||||
|
except:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_lon(val: str) -> bool:
|
||||||
|
"""Given a string, check if it corresponds to a valid decimal longitude value"""
|
||||||
|
try:
|
||||||
|
val = float(val)
|
||||||
|
return val >= -180 and val <= 180
|
||||||
|
except:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_not_none(val: any) -> bool:
|
||||||
|
return val is not None
|
||||||
|
|
||||||
|
|
||||||
|
def read_csv_file(filename):
|
||||||
|
"""Read input csv file, dropping rows that don't have valid location data."""
|
||||||
|
# replace NaN for all fields not to break kepler parsing.
|
||||||
|
df = pd.read_csv(filename).replace({np.nan: None})
|
||||||
|
|
||||||
|
# construct an index of values with valid lat & lon.
|
||||||
|
valid_index = df.lat.apply(is_valid_lat) & df.lon.apply(is_valid_lon)
|
||||||
|
df_invalid = df[~valid_index]
|
||||||
|
|
||||||
|
count_invalid = len(df_invalid)
|
||||||
|
if count_invalid:
|
||||||
|
df_not_empty = df_invalid[
|
||||||
|
(df_invalid.lat.apply(is_not_none) | df_invalid.lon.apply(is_not_none))
|
||||||
|
]
|
||||||
|
|
||||||
|
count_not_empty = len(df_not_empty)
|
||||||
|
count_empty = count_invalid - count_not_empty
|
||||||
|
|
||||||
|
if count_empty:
|
||||||
|
print(f"Removed {count_empty} empty coordinate pairs.")
|
||||||
|
|
||||||
|
if count_not_empty:
|
||||||
|
print(f"Removed {count_not_empty} invalid coordinate pairs:")
|
||||||
|
print(df_not_empty[["lat", "lon"]].to_string())
|
||||||
|
|
||||||
|
return df[valid_index]
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_file_path(dirname, filename):
|
||||||
|
"""Ensure a parent directory exists for a file."""
|
||||||
|
path = Path(dirname)
|
||||||
|
path.mkdir(parents=True, exist_ok=True)
|
||||||
|
return path / filename
|
||||||
|
|
||||||
|
|
||||||
|
def write_output_file(dirname, filename, data):
|
||||||
|
"""Write a file, ensuring parent directories."""
|
||||||
|
filepath = ensure_file_path(dirname, filename)
|
||||||
|
|
||||||
|
with open(filepath, "w") as f:
|
||||||
|
f.write(data)
|
||||||
|
|
||||||
|
return filepath
|
||||||
|
|
||||||
|
|
||||||
|
def write_visualization(dirname, filename, data):
|
||||||
|
"""Write a visualization, ensuring parent directories."""
|
||||||
|
|
||||||
|
if not has_kepler:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Hide kepler stdout output.
|
||||||
|
with HiddenPrints():
|
||||||
|
map = KeplerGl()
|
||||||
|
|
||||||
|
map.add_data(data=data, name="clusters")
|
||||||
|
|
||||||
|
# config configures a default color scheme for our clusters layer.
|
||||||
|
config_file = resource_filename("geoclustering", "kepler_config.json")
|
||||||
|
with open(config_file) as f:
|
||||||
|
map.config = json.loads(f.read())
|
||||||
|
|
||||||
|
filepath = ensure_file_path(dirname, filename)
|
||||||
|
|
||||||
|
# Hide kepler stdout output.
|
||||||
|
with HiddenPrints():
|
||||||
|
map.save_to_html(file_name=str(filepath), center_map=True)
|
||||||
|
|
||||||
|
return filepath
|
||||||
@@ -9,7 +9,7 @@
|
|||||||
"config": {
|
"config": {
|
||||||
"dataId": "clusters",
|
"dataId": "clusters",
|
||||||
"label": "clusters",
|
"label": "clusters",
|
||||||
"color": [179, 173, 158],
|
"color": [248, 149, 112],
|
||||||
"highlightColor": [252, 242, 26, 255],
|
"highlightColor": [252, 242, 26, 255],
|
||||||
"columns": { "geojson": "_geojson" },
|
"columns": { "geojson": "_geojson" },
|
||||||
"isVisible": true,
|
"isVisible": true,
|
||||||
@@ -19,16 +19,30 @@
|
|||||||
"thickness": 0.5,
|
"thickness": 0.5,
|
||||||
"strokeColor": null,
|
"strokeColor": null,
|
||||||
"colorRange": {
|
"colorRange": {
|
||||||
"name": "Global Warming",
|
"name": "Uber Viz Qualitative 4",
|
||||||
"type": "sequential",
|
"type": "qualitative",
|
||||||
"category": "Uber",
|
"category": "Uber",
|
||||||
"colors": [
|
"colors": [
|
||||||
"#5A1846",
|
"#12939A",
|
||||||
"#900C3F",
|
"#DDB27C",
|
||||||
"#C70039",
|
"#88572C",
|
||||||
"#E3611C",
|
"#FF991F",
|
||||||
"#F1920E",
|
"#F15C17",
|
||||||
"#FFC300"
|
"#223F9A",
|
||||||
|
"#DA70BF",
|
||||||
|
"#125C77",
|
||||||
|
"#4DC19C",
|
||||||
|
"#776E57",
|
||||||
|
"#17B8BE",
|
||||||
|
"#F6D18A",
|
||||||
|
"#B7885E",
|
||||||
|
"#FFCB99",
|
||||||
|
"#F89570",
|
||||||
|
"#829AE3",
|
||||||
|
"#E79FD5",
|
||||||
|
"#1E96BE",
|
||||||
|
"#89DAC1",
|
||||||
|
"#B3AD9E"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"strokeColorRange": {
|
"strokeColorRange": {
|
||||||
11
geoclustering/version.py
Normal file
11
geoclustering/version.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# Version components are kept as strings so they can be joined verbatim.
_MAJOR = "0"
_MINOR = "4"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "1"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""

# "MAJOR.MINOR".
VERSION_SHORT = ".".join((_MAJOR, _MINOR))
# "MAJOR.MINOR.PATCH" immediately followed by the (possibly empty) suffix.
VERSION = ".".join((_MAJOR, _MINOR, _PATCH)) + _SUFFIX
|
||||||
3
pytest.ini
Normal file
3
pytest.ini
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
[pytest]
|
||||||
|
testpaths = tests/
|
||||||
|
python_files = *.py
|
||||||
18
scripts/release.sh
Executable file
18
scripts/release.sh
Executable file
@@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
#
# Commit any pending changes, then create and push a git tag named after the
# version declared in geoclustering/version.py.

set -e

# Derive the tag name from the package version so the two cannot drift apart.
TAG=$(python -c 'from geoclustering.version import VERSION; print("v" + VERSION)')

# Only an explicit yes continues, so the prompt advertises "N" as the default.
# -r keeps backslashes in the reply literal.
read -r -p "Creating new release for $TAG. Do you want to continue? [y/N] " prompt

if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
    git add -A
    # The commit is best-effort: it fails harmlessly when nothing is staged.
    git commit -m "Bump version to $TAG for release" || true
    git push
    echo "Creating new git tag $TAG"
    git tag "$TAG" -m "$TAG"
    git push --tags
else
    echo "Cancelled"
    exit 1
fi
|
||||||
34
setup.py
34
setup.py
@@ -1,21 +1,41 @@
|
|||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
|
||||||
|
# version.py defines the VERSION and VERSION_SHORT variables.
|
||||||
|
# We use exec here so we don't import geoclustering whilst setting up.
|
||||||
|
VERSION = {} # type: ignore
|
||||||
|
with open("geoclustering/version.py", "r") as version_file:
|
||||||
|
exec(version_file.read(), VERSION)
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="geocluster",
|
name="geoclustering",
|
||||||
version="0.1",
|
version=VERSION["VERSION"],
|
||||||
description="",
|
description="📍 command-line tool for clustering geolocations.",
|
||||||
|
long_description=open("README.md").read(),
|
||||||
|
long_description_content_type="text/markdown",
|
||||||
|
classifiers=[
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"Intended Audience :: Science/Research",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
],
|
||||||
author="Bellingcat",
|
author="Bellingcat",
|
||||||
packages=["geocluster"],
|
author_email="tech@bellingcat.com",
|
||||||
entry_points={"console_scripts": ["geocluster = geocluster.__main__:main"]},
|
license="MIT",
|
||||||
|
packages=["geoclustering"],
|
||||||
|
package_data={"geoclustering": ["kepler_config.json"]},
|
||||||
|
keywords=["cluster", "gis", "pattern-analysis"],
|
||||||
|
entry_points={"console_scripts": ["geoclustering = geoclustering.__main__:main"]},
|
||||||
install_requires=[
|
install_requires=[
|
||||||
"click",
|
"click",
|
||||||
"geojson",
|
"geojson",
|
||||||
"keplergl",
|
|
||||||
"numpy",
|
"numpy",
|
||||||
"pandas",
|
"pandas",
|
||||||
"scikit-learn",
|
"scikit-learn",
|
||||||
],
|
],
|
||||||
extras_require={"dev": ["black", "wheel"]},
|
extras_require={
|
||||||
|
"dev": ["black", "wheel", "pre-commit", "pytest"],
|
||||||
|
"full": ["keplergl"],
|
||||||
|
},
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
|||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
41
tests/clustering.py
Normal file
41
tests/clustering.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
from geoclustering.clustering import cluster_locations
|
||||||
|
from tests.helpers import read_fixture_csv
|
||||||
|
|
||||||
|
|
||||||
|
df = read_fixture_csv("clustering.csv")
|
||||||
|
|
||||||
|
|
||||||
|
def has_member(list, name):
    """Return True if any entry in ``list`` has ``entry["name"] == name``.

    The match is decided by the name comparison itself rather than by the
    truthiness of the matching entry, so entries that evaluate as falsy
    are still found.
    """
    return any(entry["name"] == name for entry in list)
|
||||||
|
|
||||||
|
|
||||||
|
def test_clustering_all():
    """With a wide radius there should be one cluster holding everyone but Erin."""
    clusters = cluster_locations(
        df=df,
        algorithm="dbscan",
        radius_km=1.97,
        min_cluster_size=4,
    )
    cluster_count = len(clusters.values())
    assert cluster_count == 1
    members = clusters[0]
    assert len(members) == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_clustering_split():
    """A small radius should split the fixture into two clusters of two."""
    res = cluster_locations(
        df=df, algorithm="dbscan", radius_km=0.5, min_cluster_size=2
    )
    # there should be two clusters: Alice & Bob and Carol & Dan
    assert len(res.values()) == 2
    cluster_one = res[0]
    cluster_two = res[1]
    assert len(cluster_one) == 2
    assert has_member(cluster_one, "Alice")
    assert has_member(cluster_one, "Bob")
    # Check the second cluster's size too, so an extra member cannot slip in.
    assert len(cluster_two) == 2
    assert has_member(cluster_two, "Carol")
    assert has_member(cluster_two, "Dan")
|
||||||
|
|
||||||
|
|
||||||
|
def test_clustering_none():
    """A small radius with a minimum size of three should yield no clusters."""
    clusters = cluster_locations(
        df=df,
        algorithm="dbscan",
        radius_km=0.5,
        min_cluster_size=3,
    )
    cluster_count = len(clusters.values())
    assert cluster_count == 0
|
||||||
30
tests/encoding.py
Normal file
30
tests/encoding.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
from geoclustering.encoding import encode_clusters
|
||||||
|
from tests.helpers import read_fixture_csv, read_fixture_content
|
||||||
|
|
||||||
|
|
||||||
|
df = read_fixture_csv("clustering.csv")
|
||||||
|
|
||||||
|
|
||||||
|
def test_encoders():
    """Compare each encoded output with its stored snapshot fixture."""
    alice = {"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}
    bob = {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}
    carol = {"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}
    dan = {
        "id": 4,
        "name": "Dan",
        "lat": 52.52443559865125,
        "lon": 13.41261723049818,
    }
    clusters = {0: [alice, bob], 1: [carol, dan]}

    res = encode_clusters(clusters)

    # (key in the encoder result, snapshot file holding the expected text)
    expectations = (
        ("string", "snapshots/result.txt"),
        ("json", "snapshots/result.json"),
        ("geojson", "snapshots/result.geojson"),
        ("csv", "snapshots/result.csv"),
    )
    for key, snapshot in expectations:
        assert res[key] == read_fixture_content(snapshot)
|
||||||
6
tests/fixtures/clustering.csv
vendored
Normal file
6
tests/fixtures/clustering.csv
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
id,name,lat,lon
|
||||||
|
1,Alice,52.523955,13.442362
|
||||||
|
2,Bob,52.526659,13.448097
|
||||||
|
3,Carol,52.525626,13.419246
|
||||||
|
4,Dan,52.52443559865125,13.41261723049818
|
||||||
|
5,Erin,52.524838991760774,13.383188597040382
|
||||||
|
9
tests/fixtures/io.csv
vendored
Normal file
9
tests/fixtures/io.csv
vendored
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
id,name,lat,lon
|
||||||
|
1,Alice,,
|
||||||
|
2,,52.523955,13.442362
|
||||||
|
,,-90.12,132.23
|
||||||
|
4,,78.234,-180.1212
|
||||||
|
5,Bob,52.524838991760774,13.383188597040382
|
||||||
|
6,Peter,91.234,
|
||||||
|
7,Horst,,23.23
|
||||||
|
7,Erin,foo,bar
|
||||||
|
2002
tests/fixtures/mock1000.csv
vendored
2002
tests/fixtures/mock1000.csv
vendored
File diff suppressed because it is too large
Load Diff
102
tests/fixtures/mock50.csv
vendored
102
tests/fixtures/mock50.csv
vendored
@@ -1,51 +1,51 @@
|
|||||||
id,name,lat,lon
|
id,name,lat,lon
|
||||||
1,Bonnibelle Mathwen,40.1324085,64.4911086
|
1,Bonnibelle Mathwen,40.1324085,64.4911086
|
||||||
2,Fayette Elt,49.6235379,6.2379992
|
2,Fayette Elt,49.6235379,6.2379992
|
||||||
3,Jandy Cooch,-7.5874497,110.7420464
|
3,Jandy Cooch,-7.5874497,110.7420464
|
||||||
4,Robb Gerbel,22.2455315,-80.3936994
|
4,Robb Gerbel,22.2455315,-80.3936994
|
||||||
5,Silvie Clipson,40.3418956,21.5118754
|
5,Silvie Clipson,40.3418956,21.5118754
|
||||||
6,Kristina Izakoff,30.741991,121.341969
|
6,Kristina Izakoff,30.741991,121.341969
|
||||||
7,Ricky Sweeting,11.2666664,122.5333328
|
7,Ricky Sweeting,11.2666664,122.5333328
|
||||||
8,Quintin Hazart,35.119385,109.167435
|
8,Quintin Hazart,35.119385,109.167435
|
||||||
9,Sholom Kilmister,55.7393377,37.6642542
|
9,Sholom Kilmister,55.7393377,37.6642542
|
||||||
10,Misty Dooher,49.9776657,20.9421091
|
10,Misty Dooher,49.9776657,20.9421091
|
||||||
11,Knox Phython,-8.4985,123.5226
|
11,Knox Phython,-8.4985,123.5226
|
||||||
12,Shay Davidy,14.4142191,120.9495257
|
12,Shay Davidy,14.4142191,120.9495257
|
||||||
13,Dre Benoey,-31.4561755,-64.2111608
|
13,Dre Benoey,-31.4561755,-64.2111608
|
||||||
14,Prudi Tomek,40.692169,117.163821
|
14,Prudi Tomek,40.692169,117.163821
|
||||||
15,Evey Ealam,31.123586,114.893666
|
15,Evey Ealam,31.123586,114.893666
|
||||||
16,Norry Urch,45.8022541,17.497172
|
16,Norry Urch,45.8022541,17.497172
|
||||||
17,Valerye Dumberell,50.4438122,48.1450932
|
17,Valerye Dumberell,50.4438122,48.1450932
|
||||||
18,Freddy Furtado,58.3767785,11.6764538
|
18,Freddy Furtado,58.3767785,11.6764538
|
||||||
19,Catarina Samett,50.4034992,26.141892
|
19,Catarina Samett,50.4034992,26.141892
|
||||||
20,Lidia Muckian,-38.7359018,-72.5903739
|
20,Lidia Muckian,-38.7359018,-72.5903739
|
||||||
21,Stacey Dockrey,29.741986,106.273576
|
21,Stacey Dockrey,29.741986,106.273576
|
||||||
22,Norri Bonhill,60.6184239,16.7769535
|
22,Norri Bonhill,60.6184239,16.7769535
|
||||||
23,Florence Pretsel,55.96667,25.15
|
23,Florence Pretsel,55.96667,25.15
|
||||||
24,Marten Matantsev,50.9603536,14.3596743
|
24,Marten Matantsev,50.9603536,14.3596743
|
||||||
25,Claiborn Everall,43.884893,-0.5046003
|
25,Claiborn Everall,43.884893,-0.5046003
|
||||||
26,Randolf Hailey,49.4679131,18.2282007
|
26,Randolf Hailey,49.4679131,18.2282007
|
||||||
27,Meggi Kirkebye,57.6888453,11.9943311
|
27,Meggi Kirkebye,57.6888453,11.9943311
|
||||||
28,Denna Le Grove,16.7124054,98.5746649
|
28,Denna Le Grove,16.7124054,98.5746649
|
||||||
29,Randy Verheijden,40.4722617,-7.9751886
|
29,Randy Verheijden,40.4722617,-7.9751886
|
||||||
30,Caterina Blancowe,35.422892,103.352654
|
30,Caterina Blancowe,35.422892,103.352654
|
||||||
31,Joanne Adamovitch,55.9251242,39.4489055
|
31,Joanne Adamovitch,55.9251242,39.4489055
|
||||||
32,Orazio Coppins,,111.6556388
|
32,Orazio Coppins,,111.6556388
|
||||||
33,Anastassia Bennedsen,45.212088,130.478187
|
33,Anastassia Bennedsen,45.212088,130.478187
|
||||||
34,Linoel Ruggier,22.066171,107.781956
|
34,Linoel Ruggier,22.066171,107.781956
|
||||||
35,Paulina Moralis,-11.806679,-77.1657716
|
35,Paulina Moralis,-11.806679,-77.1657716
|
||||||
36,Ambur Outhwaite,59.4033695,17.9443213
|
36,Ambur Outhwaite,59.4033695,17.9443213
|
||||||
37,Laetitia Aspland,37.6086169,138.9089988
|
37,Laetitia Aspland,37.6086169,138.9089988
|
||||||
38,Dew Moxstead,6.1317011,-75.6382657
|
38,Dew Moxstead,6.1317011,-75.6382657
|
||||||
39,Berna Klaiser,40.1394691,-8.3092933
|
39,Berna Klaiser,40.1394691,-8.3092933
|
||||||
40,Krystle Ingold,7.1518505,0.4738293
|
40,Krystle Ingold,7.1518505,0.4738293
|
||||||
41,Cassaundra Cuffin,56.6342788,36.885813
|
41,Cassaundra Cuffin,56.6342788,36.885813
|
||||||
42,Malanie Harpin,46.9,109.75
|
42,Malanie Harpin,46.9,109.75
|
||||||
43,Laurence Stothart,39.912765,116.18362
|
43,Laurence Stothart,39.912765,116.18362
|
||||||
44,Luz O'Siaghail,40.4476834,25.5917918
|
44,Luz O'Siaghail,40.4476834,25.5917918
|
||||||
45,Brittni Garrod,59.0836123,16.18741
|
45,Brittni Garrod,59.0836123,16.18741
|
||||||
46,Karlie Semrad,-8.793392,121.9330894
|
46,Karlie Semrad,-8.793392,121.9330894
|
||||||
47,Leigh Allderidge,45.768045,15.947739
|
47,Leigh Allderidge,45.768045,15.947739
|
||||||
48,Ashlin Gogerty,50.3250139,34.9100068
|
48,Ashlin Gogerty,50.3250139,34.9100068
|
||||||
49,Mozelle De Launde,53.31611,40.70806
|
49,Mozelle De Launde,53.31611,40.70806
|
||||||
50,Ema le Keux,41.6315023,19.9310781
|
50,Ema le Keux,41.6315023,19.9310781
|
||||||
|
|||||||
|
5
tests/fixtures/snapshots/result.csv
vendored
Normal file
5
tests/fixtures/snapshots/result.csv
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"cluster_id","id","name","lat","lon"
|
||||||
|
0,1,"Alice",52.523955,13.442362
|
||||||
|
0,2,"Bob",52.526659,13.448097
|
||||||
|
1,3,"Carol",52.525626,13.419246
|
||||||
|
1,4,"Dan",52.52443559865125,13.41261723049818
|
||||||
|
1
tests/fixtures/snapshots/result.geojson
vendored
Normal file
1
tests/fixtures/snapshots/result.geojson
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.442362, 52.523955]}, "properties": {"id": 1, "name": "Alice", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.448097, 52.526659]}, "properties": {"id": 2, "name": "Bob", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.419246, 52.525626]}, "properties": {"id": 3, "name": "Carol", "cluster_id": 1}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.412617, 52.524436]}, "properties": {"id": 4, "name": "Dan", "cluster_id": 1}}]}
|
||||||
1
tests/fixtures/snapshots/result.json
vendored
Normal file
1
tests/fixtures/snapshots/result.json
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
[{"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}]
|
||||||
7
tests/fixtures/snapshots/result.txt
vendored
Normal file
7
tests/fixtures/snapshots/result.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
Cluster 0
|
||||||
|
id 1, name Alice, lat 52.523955, lon 13.442362
|
||||||
|
id 2, name Bob, lat 52.526659, lon 13.448097
|
||||||
|
|
||||||
|
Cluster 1
|
||||||
|
id 3, name Carol, lat 52.525626, lon 13.419246
|
||||||
|
id 4, name Dan, lat 52.52443559865125, lon 13.41261723049818
|
||||||
16
tests/helpers.py
Normal file
16
tests/helpers.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
import os
|
||||||
|
from geoclustering.io import read_csv_file
|
||||||
|
|
||||||
|
|
||||||
|
def get_fixture_path(filename):
    """Return the path of ``filename`` inside the ``fixtures`` directory
    that sits next to this module."""
    this_file = os.path.realpath(__file__)
    return os.path.join(os.path.dirname(this_file), "fixtures", filename)
|
||||||
|
|
||||||
|
|
||||||
|
def read_fixture_csv(filename):
    """Load the named fixture file through ``read_csv_file`` and return its result."""
    fixture_path = get_fixture_path(filename)
    return read_csv_file(fixture_path)
|
||||||
|
|
||||||
|
|
||||||
|
def read_fixture_content(filename):
    """Return the full text of the named fixture file.

    The file is decoded as UTF-8 explicitly so that snapshot comparisons do
    not depend on the locale's default encoding.
    """
    with open(get_fixture_path(filename), encoding="utf-8") as f:
        return f.read()
|
||||||
25
tests/io.py
Normal file
25
tests/io.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import shutil
|
||||||
|
from geoclustering.io import write_output_file
|
||||||
|
from tests.helpers import read_fixture_csv
|
||||||
|
|
||||||
|
|
||||||
|
def test_csv_filters():
    """Reading the io.csv fixture should keep only its two valid rows."""
    df = read_fixture_csv("io.csv")
    # entries 2 & 5 in fixture are valid.
    assert len(df) == 2
    # Compare with None by identity; `== None` can be overridden by the
    # value's type and is not a reliable check for a missing name.
    assert df.iloc[0]["name"] is None
    assert df.iloc[1]["name"] == "Bob"
|
||||||
|
|
||||||
|
|
||||||
|
def test_write_output_file():
    """write_output_file should write the content into a not-yet-existing directory."""
    dirname = "./this/dir/does/not/exist"
    filename = "test.txt"

    try:
        write_output_file(dirname, filename, "test")

        with open(Path(dirname) / filename) as handle:
            assert handle.read() == "test"
    finally:
        # Always remove the scratch tree, even when the write or the assertion
        # fails; ignore_errors keeps a missing tree from masking that failure.
        shutil.rmtree(Path("./this"), ignore_errors=True)
|
||||||
Reference in New Issue
Block a user