first commit
37
README.md
@@ -1,2 +1,35 @@
|
||||
# sugartrail
|
||||
Network analysis tool for locating suspicious directors, locations and companies via Companies House
|
||||
# Sugartrail
|
||||
|
||||
## Tool Description
|
||||
|
||||
Sugartrail is a work-in-progress network analysis tool and workflow that helps researchers start from one suspicious company director and discover other suspicious companies, directors and locations through Companies House.
|
||||
|
||||
The workflow is based on the following observations:
|
||||
|
||||
- suspicious directors often have many active appointments registered to multiple historic addresses
|
||||
- addresses with many registered businesses can contain multiple scam businesses
|
||||
|
||||
## Requirements
|
||||
|
||||
You will need an API key from Companies House to authenticate with the API. First, create a live application to obtain an API key by following the [Companies House guide](https://developer.company-information.service.gov.uk/how-to-create-an-application). Then manually hard-code the API key inside the `sugartrail.py` script as the value of `access_token`.
|
||||
|
||||
## Installation
|
||||
|
||||
1. Make sure you have Conda installed
|
||||
|
||||
2. Download the tool's repository using the command:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ribenamaplesyrup/sugartrail.git
|
||||
```
|
||||
|
||||
3. Navigate to the main directory and run:
|
||||
|
||||
```bash
|
||||
conda env create -f environment.yml
|
||||
conda activate candystore
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
- A walkthrough of how to use the tool is included in the Jupyter notebook `Tutorial 1 - Exit Through the Candy Shop.ipynb`, which shows how we can get from suspicious candy stores on Oxford Street to several prolific scammers.
|
||||
|
||||
1529
Tutorial 1 - Exit Through the Candy Shop.ipynb
Normal file
BIN
assets/candyy.jpeg
Normal file
|
After Width: | Height: | Size: 1.4 MiB |
BIN
assets/consultancy_house.png
Normal file
|
After Width: | Height: | Size: 1.6 MiB |
BIN
assets/diamond.png
Normal file
|
After Width: | Height: | Size: 165 KiB |
BIN
assets/dm.png
Normal file
|
After Width: | Height: | Size: 119 KiB |
BIN
assets/euro.png
Normal file
|
After Width: | Height: | Size: 2.0 MiB |
BIN
assets/exclusive.png
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
assets/hold.png
Normal file
|
After Width: | Height: | Size: 1.7 MiB |
BIN
assets/review.png
Normal file
|
After Width: | Height: | Size: 51 KiB |
111
environment.yml
Normal file
@@ -0,0 +1,111 @@
|
||||
# Conda environment for Sugartrail; create it with `conda env create -f environment.yml`
# and activate it under the name below.
# NOTE(review): every dependency is pinned to an exact build string, which presumably ties
# this file to the platform it was exported from - confirm before using it elsewhere.
name: candystore
|
||||
channels:
|
||||
- anaconda
|
||||
- defaults
|
||||
dependencies:
|
||||
- appnope=0.1.2=py39hecd8cb5_1001
|
||||
- argon2-cffi=21.3.0=pyhd3eb1b0_0
|
||||
- argon2-cffi-bindings=21.2.0=py39hca72f7f_0
|
||||
- asttokens=2.0.5=pyhd3eb1b0_0
|
||||
- attrs=21.4.0=pyhd3eb1b0_0
|
||||
- backcall=0.2.0=pyhd3eb1b0_0
|
||||
- beautifulsoup4=4.11.1=py39hecd8cb5_0
|
||||
- blas=1.0=mkl
|
||||
- bleach=4.1.0=pyhd3eb1b0_0
|
||||
- bottleneck=1.3.4=py39h67323c0_0
|
||||
- ca-certificates=2022.4.26=hecd8cb5_0
|
||||
- certifi=2022.6.15=py39hecd8cb5_0
|
||||
- cffi=1.15.0=py39hc55c11b_1
|
||||
- debugpy=1.5.1=py39he9d5cce_0
|
||||
- decorator=5.1.1=pyhd3eb1b0_0
|
||||
- defusedxml=0.7.1=pyhd3eb1b0_0
|
||||
- entrypoints=0.4=py39hecd8cb5_0
|
||||
- executing=0.8.3=pyhd3eb1b0_0
|
||||
- icu=58.2=h0a44026_3
|
||||
- intel-openmp=2021.4.0=hecd8cb5_3538
|
||||
- ipykernel=6.9.1=py39hecd8cb5_0
|
||||
- ipython=8.3.0=py39hecd8cb5_0
|
||||
- ipython_genutils=0.2.0=pyhd3eb1b0_1
|
||||
- ipywidgets=7.6.5=pyhd3eb1b0_1
|
||||
- jedi=0.18.1=py39hecd8cb5_1
|
||||
- jinja2=3.0.3=pyhd3eb1b0_0
|
||||
- jpeg=9e=hca72f7f_0
|
||||
- jsonschema=4.4.0=py39hecd8cb5_0
|
||||
- jupyter=1.0.0=py39hecd8cb5_7
|
||||
- jupyter_client=7.2.2=py39hecd8cb5_0
|
||||
- jupyter_console=6.4.3=pyhd3eb1b0_0
|
||||
- jupyter_core=4.10.0=py39hecd8cb5_0
|
||||
- jupyterlab_pygments=0.1.2=py_0
|
||||
- jupyterlab_widgets=1.0.0=pyhd3eb1b0_1
|
||||
- libcxx=12.0.0=h2f01273_0
|
||||
- libffi=3.3=hb1e8313_2
|
||||
- libpng=1.6.37=ha441bb4_0
|
||||
- libsodium=1.0.18=h1de35cc_0
|
||||
- markupsafe=2.1.1=py39hca72f7f_0
|
||||
- matplotlib-inline=0.1.2=pyhd3eb1b0_2
|
||||
- mistune=0.8.4=py39h9ed2024_1000
|
||||
- mkl=2021.4.0=hecd8cb5_637
|
||||
- mkl-service=2.4.0=py39h9ed2024_0
|
||||
- mkl_fft=1.3.1=py39h4ab4a9b_0
|
||||
- mkl_random=1.2.2=py39hb2f4e1b_0
|
||||
- nbclient=0.5.13=py39hecd8cb5_0
|
||||
- nbconvert=6.4.4=py39hecd8cb5_0
|
||||
- nbformat=5.3.0=py39hecd8cb5_0
|
||||
- ncurses=6.3=hca72f7f_3
|
||||
- nest-asyncio=1.5.5=py39hecd8cb5_0
|
||||
- notebook=6.4.11=py39hecd8cb5_0
|
||||
- numexpr=2.8.1=py39h2e5f0a9_2
|
||||
- numpy=1.22.3=py39h2e5f0a9_0
|
||||
- numpy-base=1.22.3=py39h3b1a694_0
|
||||
- openssl=1.1.1o=hca72f7f_0
|
||||
- packaging=21.3=pyhd3eb1b0_0
|
||||
- pandas=1.4.2=py39he9d5cce_0
|
||||
- pandocfilters=1.5.0=pyhd3eb1b0_0
|
||||
- parso=0.8.3=pyhd3eb1b0_0
|
||||
- pexpect=4.8.0=pyhd3eb1b0_3
|
||||
- pickleshare=0.7.5=pyhd3eb1b0_1003
|
||||
- pip=22.1.2=py39hecd8cb5_0
|
||||
- prometheus_client=0.13.1=pyhd3eb1b0_0
|
||||
- prompt-toolkit=3.0.20=pyhd3eb1b0_0
|
||||
- prompt_toolkit=3.0.20=hd3eb1b0_0
|
||||
- ptyprocess=0.7.0=pyhd3eb1b0_2
|
||||
- pure_eval=0.2.2=pyhd3eb1b0_0
|
||||
- pycparser=2.21=pyhd3eb1b0_0
|
||||
- pygments=2.11.2=pyhd3eb1b0_0
|
||||
- pyparsing=3.0.4=pyhd3eb1b0_0
|
||||
- pyqt=5.9.2=py39h23ab428_6
|
||||
- pyrsistent=0.18.0=py39hca72f7f_0
|
||||
- python=3.9.12=hdfd78df_1
|
||||
- python-dateutil=2.8.2=pyhd3eb1b0_0
|
||||
- python-fastjsonschema=2.15.1=pyhd3eb1b0_0
|
||||
- pytz=2022.1=py39hecd8cb5_0
|
||||
- pyzmq=22.3.0=py39he9d5cce_2
|
||||
- qt=5.9.7=h468cd18_1
|
||||
- qtconsole=5.3.0=pyhd3eb1b0_0
|
||||
- qtpy=2.0.1=pyhd3eb1b0_0
|
||||
- readline=8.1.2=hca72f7f_1
|
||||
- send2trash=1.8.0=pyhd3eb1b0_1
|
||||
- setuptools=63.4.1=py39hecd8cb5_0
|
||||
- sip=4.19.13=py39h23ab428_0
|
||||
- six=1.16.0=pyhd3eb1b0_1
|
||||
- soupsieve=2.3.1=pyhd3eb1b0_0
|
||||
- sqlite=3.39.2=h707629a_0
|
||||
- stack_data=0.2.0=pyhd3eb1b0_0
|
||||
- terminado=0.13.1=py39hecd8cb5_0
|
||||
- testpath=0.6.0=py39hecd8cb5_0
|
||||
- tk=8.6.12=h5d9f67b_0
|
||||
- tornado=6.1=py39h9ed2024_0
|
||||
- traitlets=5.1.1=pyhd3eb1b0_0
|
||||
- typing-extensions=4.1.1=hd3eb1b0_0
|
||||
- typing_extensions=4.1.1=pyh06a4308_0
|
||||
- tzdata=2022a=hda174b7_0
|
||||
- wcwidth=0.2.5=pyhd3eb1b0_0
|
||||
- webencodings=0.5.1=py39hecd8cb5_1
|
||||
- wheel=0.37.1=pyhd3eb1b0_0
|
||||
- widgetsnbextension=3.5.2=py39hecd8cb5_0
|
||||
- xz=5.2.5=hca72f7f_1
|
||||
- zeromq=4.3.4=h23ab428_0
|
||||
- zlib=1.2.12=h4dc903c_2
|
||||
- pip:
|
||||
- chwrapper==0.3.0
|
||||
- requests==2.8.1
|
||||
147
sugartrail.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import collections
import math
import os
import sys
import time
from datetime import datetime

import pandas as pd
import requests
from IPython.display import clear_output
from requests.auth import HTTPBasicAuth
|
||||
# Companies House API key. It is read from the COMPANIES_HOUSE_API_KEY environment
# variable so the secret does not have to be committed with the source; the default
# stays "" (as before), and a key can still be hard-coded by replacing that default.
access_token = os.environ.get("COMPANIES_HOUSE_API_KEY", "")

# Requests authenticate with HTTP basic auth: the key is the username, the password is blank.
username = access_token
password = ""

# Value sent as the `size` query parameter; kept as a string because it is
# concatenated straight into request URLs.
size = "5000"

# Shared auth object passed to every requests.get call in this module.
basic = HTTPBasicAuth(username, password)
|
||||
|
||||
def get_appointments(officer_id):
    """Fetch an officer's appointments and print how many exist and how many are active.

    Args:
        officer_id: officer identifier, inserted verbatim into the request path.

    Returns:
        The decoded JSON body of the appointments response.

    Raises:
        requests.HTTPError: if the API answers with an error status (for example
            when the API key is missing or the officer id is unknown).
    """
    # NOTE(review): this endpoint may page its results with items_per_page/start_index
    # rather than `size` - confirm against the Companies House API reference.
    url = "https://api.company-information.service.gov.uk/officers/" + officer_id + "/appointments?size=" + size
    response = requests.get(url, auth=basic)
    # Fail loudly on an error response instead of tripping over a missing 'items' key.
    response.raise_for_status()

    # Parse the body once and reuse it for both the summary and the return value.
    payload = response.json()
    items = payload.get('items', [])

    # An appointment counts as active when it carries no resignation date. Counting on
    # the raw items avoids a KeyError when no appointment has a "resigned_on" field.
    appointments = len(items)
    resigned = sum(1 for item in items if item.get("resigned_on") is not None)
    print(str(appointments) + " appointments")
    print(str(appointments - resigned) + " active appointments")
    return payload
|
||||
|
||||
def get_locations(companies, address_type: str):
    """Build one address string per item and print how often each address and postcode occurs.

    Args:
        companies: mapping with an 'items' list. For the correspondence lookup each
            item carries an 'address' mapping; for the registered lookup each item
            carries item['links']['company'], the API path of the company.
        address_type: "correspondance" (historic spelling, kept for existing callers;
            "correspondence" is accepted too) to use the address stored on each item,
            or "registered" to fetch each company's registered office address.

    Returns:
        A list with one comma-separated address string per item, in item order, or
        None when address_type is not recognised.
    """
    items = companies['items']

    if address_type in ("correspondance", "correspondence"):
        keys = ("premises", "address_line_1", "locality", "country", "postal_code")
        records = [item.get('address') or {} for item in items]
    elif address_type == "registered":
        keys = ("address_line_1", "address_line_2", "country", "locality", "postal_code")
        records = []
        for item in items:
            url = "https://api.company-information.service.gov.uk" + item['links']['company'] + "/registered-office-address"
            # Parse each response once; the decoded body is the address record.
            records.append(requests.get(url, auth=basic).json())
    else:
        print("unrecognised address type: should be either correspondance or registered")
        return None

    # Address fields are optional, so join only the parts that are present instead of
    # failing on the first missing key.
    addresses = [
        ", ".join(record[key] for key in keys if record.get(key))
        for record in records
    ]
    # Collect the postcode of every record (not just the last one) so the unique
    # count below reflects the whole input.
    postcodes = [record['postal_code'] for record in records if record.get('postal_code')]

    postcode_frequency = collections.Counter(postcodes)
    print(str(len(postcode_frequency)) + " unique postcodes")

    # most_common() orders by descending count, giving the most-used addresses first.
    frequency = dict(collections.Counter(addresses).most_common())
    print(str(len(frequency)) + " unique " + address_type + " addresses")
    print(frequency)
    return addresses
|
||||
|
||||
def year_of_creation(companies):
    """Tally companies by the year in which they were created.

    Args:
        companies: iterable of mappings, each with a 'date_of_creation' string whose
            first four characters are the year.

    Returns:
        A dict mapping year (as a string) to the number of companies created that
        year, ordered from the most frequent year to the least frequent.
    """
    tally = collections.Counter(
        company['date_of_creation'][:4] for company in companies
    )
    # most_common() sorts by descending count and keeps first-seen order for ties.
    return dict(tally.most_common())
|
||||
|
||||
def age(creation: str, cessation: str):
    """Return the whole number of 365-day years between two dates.

    Args:
        creation: start date formatted as YYYY-MM-DD.
        cessation: end date formatted as YYYY-MM-DD.

    Returns:
        The day difference (cessation minus creation) divided by 365, rounded down.
    """
    date_format = "%Y-%m-%d"
    created_on = datetime.strptime(creation, date_format)
    ceased_on = datetime.strptime(cessation, date_format)
    lifetime = ceased_on - created_on
    return math.floor(lifetime.days / 365)
|
||||
|
||||
|
||||
def get_companies(addresses):
    """Search for the companies at each address and print a per-address summary.

    For every address the advanced company search is queried. Addresses whose search
    succeeds are summarised (total hits, active companies, the three most common
    creation years, the three most common lifespans of dissolved companies) and the
    summaries are printed from the busiest address to the quietest.

    Args:
        addresses: iterable of address strings used as the search `location`.

    Returns:
        A dict mapping each successfully searched address to its list of company
        records, ordered by descending hit count - the same order as the printed
        "Index" values.
    """
    search_url = "https://api.company-information.service.gov.uk/advanced-search/companies"
    companies = {}
    companies_summary = {}

    for address in addresses:
        # Let requests encode the query string so characters such as '&' or '#' in an
        # address cannot corrupt the request.
        response = requests.get(search_url, params={"location": address, "size": size}, auth=basic)
        if response.status_code != 200:
            # Still best-effort, but say which address was skipped and why.
            print("skipping " + str(address) + ": HTTP " + str(response.status_code))
            continue

        # Parse the body once and reuse it.
        payload = response.json()
        items = payload['items']
        companies[address] = items

        active_companies = [company for company in items if company.get('company_status') == 'active']
        # Only dissolved companies that carry both dates can have a lifespan computed.
        dissolved_companies = [
            company for company in items
            if company.get('company_status') == 'dissolved'
            and 'date_of_creation' in company
            and 'date_of_cessation' in company
        ]

        years = year_of_creation(items)
        active_years = year_of_creation(active_companies)
        # age() yields whole years survived; most_common() orders by descending count.
        survival_years = [
            age(company['date_of_creation'], company['date_of_cessation'])
            for company in dissolved_companies
        ]
        survival_frequency = dict(collections.Counter(survival_years).most_common())

        companies_summary[address] = {
            "frequency": payload['hits'],
            "active_companies": len(active_companies),
            "3_years_active": dict(list(active_years.items())[:3]),
            "3_years_all": dict(list(years.items())[:3]),
            "3_survival": dict(list(survival_frequency.items())[:3]),
        }

    # Busiest addresses first; the returned dict follows the same order.
    companies_summary = dict(sorted(companies_summary.items(), key=lambda item: item[1]["frequency"], reverse=True))

    for i, (address, summary) in enumerate(companies_summary.items()):
        print("Index: " + str(i))
        print(address)
        print(str(summary['frequency']) + " companies registered or corresponding here, " + str(summary['active_companies']) + " are active.")
        for year, count in summary['3_years_active'].items():
            print(str(count) + " currently active companies registered in " + str(year))
        for survived, count in summary['3_survival'].items():
            # A lifespan of N whole years means dissolution between year N and year N+1.
            print(str(count) + " companies dissolved between years " + str(survived) + "-" + str(survived + 1))
        print("")

    return {address: companies[address] for address in companies_summary}
|
||||
|
||||
def get_officers(company_locations, indices):
    """Collect the officer names of every company at the selected locations.

    Args:
        company_locations: dict mapping a location to its list of company records
            (each with a 'company_number'), e.g. the value returned by get_companies.
        indices: positions, within the key order of company_locations, of the
            locations to process.

    Returns:
        A dict mapping each selected location (as a string) to a list holding one
        list of officer names per company, in company order. A company whose
        officers could not be read contributes an empty list.
    """
    locations = list(company_locations)
    officers = {}

    for index in indices:
        location = locations[index]
        businesses = company_locations[location]
        total = len(businesses)
        rosters = officers[str(location)] = []

        for i, business in enumerate(businesses):
            # NOTE(review): this endpoint may page its results with
            # items_per_page/start_index rather than `size` - confirm against the
            # Companies House API reference.
            url = "https://api.company-information.service.gov.uk/company/" + business['company_number'] + "/officers?size=" + size
            progress = "completion: " + str(100*i/total) + ", index:" + str(i)
            rosters.append(_fetch_officer_names(url, progress))
            clear_output(wait=True)
            # Short pause between requests, as in the original pacing.
            time.sleep(0.41)

    for location, rosters in officers.items():
        directors = [name for roster in rosters for name in roster]
        print(location)
        print("-")
        print("Most prolific officers:")
        # most_common() lists officers from most to fewest companies.
        for officer, count in collections.Counter(directors).most_common():
            print(str(officer) + " runs " + str(count) + " businesses")
        print("")
    return officers


def _fetch_officer_names(url, progress):
    """Return the officer names listed at url, retrying only while the failure looks transient."""
    while True:
        clear_output(wait=True)
        print(progress)
        try:
            leadership = requests.get(url, auth=basic)
        except requests.exceptions.RequestException as error:
            # Transport problem: wait and retry. Unlike a bare `except`, this lets
            # KeyboardInterrupt through so the loop can always be stopped.
            print(type(error))
            print("taking a 10 second timeout")
            time.sleep(10)
            clear_output(wait=True)
            continue

        print(leadership)
        if leadership.status_code == 429 or leadership.status_code >= 500:
            # Rate limiting or a server-side error: back off and try again.
            print("taking a 10 second timeout")
            time.sleep(10)
            clear_output(wait=True)
            continue

        try:
            payload = leadership.json()
        except ValueError:
            # No JSON body: retrying cannot help, so record no officers.
            return []
        # An error body has no 'items'; treat it as "no officers" instead of retrying forever.
        items = payload.get('items', []) if isinstance(payload, dict) else []
        return [officer['name'] for officer in items]
|
||||