mirror of
https://github.com/bellingcat/sugartrail.git
synced 2026-06-08 03:28:31 +03:00
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
BIN
candystreet.png
BIN
candystreet.png
Binary file not shown.
|
Before Width: | Height: | Size: 2.9 MiB |
@@ -402,10 +402,10 @@
|
||||
"def generate_download_zip(company_text, network):\n",
|
||||
" Path(str(company_text.value)).mkdir(parents=True, exist_ok=True)\n",
|
||||
" df_company_ids = pd.DataFrame(network.company_ids)\n",
|
||||
" df_companies = pd.DataFrame(network.companies)\n",
|
||||
" df_company_records = pd.DataFrame(network.company_records)\n",
|
||||
" df_addresses = pd.DataFrame(network.addresses)\n",
|
||||
" df_officer_ids = pd.DataFrame(network.officer_ids)\n",
|
||||
" files = {'companies': df_company_ids, 'addresses': df_addresses, 'officers': df_officer_ids, 'company_details': df_companies}\n",
|
||||
" files = {'companies': df_company_ids, 'addresses': df_addresses, 'officers': df_officer_ids, 'company_details': df_company_records}\n",
|
||||
" for key in files:\n",
|
||||
" files[key].to_csv(str(company_text.value) + '/' + key + '.csv')\n",
|
||||
" file = str(company_text.value) + '.json'\n",
|
||||
@@ -429,7 +429,7 @@
|
||||
" with tab.children[2]:\n",
|
||||
" display(pd.DataFrame(network.officer_ids))\n",
|
||||
" with tab.children[3]:\n",
|
||||
" display(pd.DataFrame(network.companies))\n",
|
||||
" display(pd.DataFrame(network.company_records))\n",
|
||||
" zip_filename = generate_download_zip(company_text, network)\n",
|
||||
" html_button = html_buttons.format(filename=zip_filename)\n",
|
||||
" with download_link:\n",
|
||||
|
||||
@@ -162,7 +162,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e12f5461",
|
||||
"id": "11b129ca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -174,7 +174,7 @@
|
||||
"id": "91c14cbb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Each company is represented by its unique ID (`company_id`), number of hops from the origin company (`n`) and the company, address or person it connects to. As we've only saved the origin company so far, there isn't any information on links or connected nodes. There are also attributes for storing officer ids (`officer_ids`) and (`addresses`) although they have no information in them yet:"
|
||||
"Each company is represented by its unique ID (`company_id`), name (`title`), number of hops from the origin company (`depth`) and the company, address or person it connects to. As we've only saved the origin company so far, there isn't any information on links or connected nodes. There are also attributes for storing officer ids (`officer_ids`) and (`addresses`) although they have no information in them yet:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -292,7 +292,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.DataFrame(network.company_ids)"
|
||||
"network.company_ids"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -310,7 +310,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.DataFrame(network.officer_ids)"
|
||||
"network.officer_ids"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -329,6 +329,24 @@
|
||||
"id": "7083402a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"network.addresses"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eb8b7408",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can load multiple results into a DataFrame for better readability:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9240d709",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.DataFrame(network.addresses)"
|
||||
]
|
||||
@@ -348,7 +366,7 @@
|
||||
"id": "b4828d92",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For reproducibility, each time we perform a hop, the methods and limit configs are stored in "
|
||||
"For reproducibility, each time we perform a hop, the methods and limit configs are stored in `hop_history` which we can view through `print_hop_history`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -358,7 +376,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.DataFrame(network.hop_history)"
|
||||
"network.hop_history"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -402,7 +420,7 @@
|
||||
"id": "dfa1b90c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To see the information added, we can check out `address_history` and `companies` properties of our class:"
|
||||
"To see the information added, we can check out `address_history` and `companies`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -422,7 +440,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.DataFrame(network.companies)"
|
||||
"pd.DataFrame(network.company_records)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"id": "9c8ebc89",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -168,7 +168,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"id": "df617fda",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -198,28 +198,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"id": "7bdde00f",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d68aba9065b4429e9852696d97be010d",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"VBox(children=(Map(center=[50, 0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title',…"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# generate map\n",
|
||||
"map_data,path_table = mapview.build_map(western_crown_network) \n",
|
||||
@@ -245,7 +229,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.DataFrame(western_crown_network.find_path('10289650'))[['node_index', 'node', 'hop', 'node_type', 'link']]"
|
||||
"pd.DataFrame(western_crown_network.find_path('10540083'))[['node_index', 'title', 'id', 'depth', 'node_type', 'link_type', 'link']]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -290,7 +274,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"*Active Companies*\n",
|
||||
"- [CANDY TOWN LTD](https://find-and-update.company-information.service.gov.uk/company/11464159) (1464159)\n",
|
||||
"- [CANDY TOWN LTD](https://find-and-update.company-information.service.gov.uk/company/11464159) (11464159)\n",
|
||||
"- [ESPANZA LIMITED](https://find-and-update.company-information.service.gov.uk/company/11474248) (11474248)\n",
|
||||
"\n",
|
||||
"*Dissolved Companies*\n",
|
||||
|
||||
@@ -45,7 +45,7 @@
|
||||
"source": [
|
||||
"officer_id = \"Nd2URspq4bvLy-hwzDZ0_p7FGJw\"\n",
|
||||
"network = base.Network(officer_id=officer_id)\n",
|
||||
"network.perform_hop(2)"
|
||||
"network.perform_hop(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -73,7 +73,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.DataFrame(network.addresses)['address'].unique()"
|
||||
"pd.DataFrame(network.addresses)['title'].unique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -139,16 +139,6 @@
|
||||
"Although lets pause to briefly explore what address would have thousands of companies registered there?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8bb8bdf1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"network.maxsize_entities[2]['node']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e8644d6b",
|
||||
@@ -276,7 +266,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.DataFrame(regent_street_network.officer_ids)['name'].value_counts()"
|
||||
"pd.DataFrame(regent_street_network.officer_ids)['title'].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -2,3 +2,4 @@ from . import api
|
||||
from . import base
|
||||
from . import processing
|
||||
from . import mapview
|
||||
from . import hop
|
||||
|
||||
@@ -2,24 +2,21 @@ import sugartrail
|
||||
import IPython
|
||||
import json
|
||||
import functools
|
||||
from string import ascii_letters as alc
|
||||
import pandas as pd
|
||||
|
||||
class Network:
|
||||
"""Class represents a network of connected companies, officers and
|
||||
addresses. Class contains methods to build network of user defined size from
|
||||
a single seed company, officer or address."""
|
||||
def __init__(self, officer_id=None, company_id=None, address=None, file=None):
|
||||
self.addresses = []
|
||||
self.officer_ids = []
|
||||
self.company_ids = []
|
||||
self.companies = []
|
||||
self.graph = {}
|
||||
self.company_records = []
|
||||
self.address_history = []
|
||||
self._officer_id = officer_id
|
||||
self._company_id = company_id
|
||||
self._address = address
|
||||
self.n = 0
|
||||
self.link_type = None
|
||||
self.hop = self.Hop()
|
||||
self.hop = sugartrail.hop.Hop()
|
||||
self.hop_history = []
|
||||
self.maxsize_entities = []
|
||||
self.processed_officers = []
|
||||
@@ -32,10 +29,8 @@ class Network:
|
||||
"""Resets the class attributes to pre-init state."""
|
||||
@functools.wraps(func)
|
||||
def wrapper_clear(*args, **kwargs):
|
||||
args[0].addresses = []
|
||||
args[0].officer_ids = []
|
||||
args[0].company_ids = []
|
||||
args[0].companies = []
|
||||
args[0].graph = {}
|
||||
args[0].company_records = []
|
||||
args[0].address_history = []
|
||||
args[0]._officer_id = None
|
||||
args[0]._company_id = None
|
||||
@@ -62,18 +57,43 @@ class Network:
|
||||
officer_info = sugartrail.api.get_appointments(new_value)
|
||||
if officer_info:
|
||||
self._officer_id = new_value
|
||||
self.officer_ids = [{
|
||||
'officer_id': new_value,
|
||||
'name': officer_info['items'][0]['name'],
|
||||
'n':self.n,
|
||||
'link_type': None,
|
||||
'node_type': None,
|
||||
'node': None
|
||||
}]
|
||||
self.graph = {
|
||||
new_value: {
|
||||
'title': officer_info['items'][0]['name'],
|
||||
'depth':self.n,
|
||||
'node_type': "Person",
|
||||
'arcs': []
|
||||
}
|
||||
}
|
||||
else:
|
||||
print(f"Officer with ID:{str(new_value)} not found")
|
||||
self._officer_id = None
|
||||
|
||||
@property
def officer_ids(self):
    """Return one table row per officer link found in the graph.

    Scans ``self.graph`` for nodes whose ``node_type`` is ``'Person'`` and
    flattens each into dicts with the keys ``officer_id``, ``title``,
    ``depth``, ``link_type`` and ``link``.

    A node without arcs yields a single row whose ``link_type`` and ``link``
    are empty strings. A node with arcs yields one row per arc, where
    ``link_type`` is the arc's ``arc_type`` and ``link`` is the arc's
    ``start_node``.

    Returns:
        list[dict]: The rows, in graph iteration order.
    """
    officer_table = []
    for officer_id, officer_data in self.graph.items():
        if officer_data['node_type'] != 'Person':
            continue
        base_row = {
            'officer_id': officer_id,
            'title': officer_data['title'],
            'depth': officer_data['depth'],
        }
        arcs = officer_data['arcs']
        if not arcs:
            officer_table.append({**base_row, 'link_type': '', 'link': ''})
            continue
        # Build a fresh dict per arc. Updating and re-appending one shared
        # dict would make every row of a multi-arc node show the last arc.
        for arc in arcs:
            officer_table.append({
                **base_row,
                'link_type': arc['arc_type'],
                'link': arc['start_node'],
            })
    return officer_table
|
||||
|
||||
@property
|
||||
def company_id(self):
|
||||
"""company_id property representing seed company."""
|
||||
@@ -86,18 +106,43 @@ class Network:
|
||||
company_info = sugartrail.api.get_company(new_value)
|
||||
if company_info:
|
||||
self._company_id = new_value
|
||||
self.company_ids = [{
|
||||
'company_id': self._company_id,
|
||||
'n':self.n,
|
||||
'link_type': '',
|
||||
'node_type': '',
|
||||
'node': ''
|
||||
}]
|
||||
self.companies = [dict(sugartrail.processing.flatten(company_info))]
|
||||
self.graph = {
|
||||
new_value: {
|
||||
'title': company_info['company_name'],
|
||||
'depth':self.n,
|
||||
'node_type': "Company",
|
||||
'arcs': []
|
||||
}
|
||||
}
|
||||
# self.companies = [dict(sugartrail.processing.flatten(company_info))]
|
||||
else:
|
||||
print(f"Company with ID:{str(new_value)} not found")
|
||||
self._company_id = None
|
||||
|
||||
@property
def company_ids(self):
    """Return one table row per company link found in the graph.

    Scans ``self.graph`` for nodes whose ``node_type`` is ``'Company'`` and
    flattens each into dicts with the keys ``company_id``, ``title``,
    ``depth``, ``link_type`` and ``link``.

    A node without arcs yields a single row whose ``link_type`` and ``link``
    are empty strings. A node with arcs yields one row per arc, where
    ``link_type`` is the arc's ``arc_type`` and ``link`` is the arc's
    ``start_node``.

    Returns:
        list[dict]: The rows, in graph iteration order.
    """
    company_table = []
    for company_id, company_data in self.graph.items():
        if company_data['node_type'] != 'Company':
            continue
        base_row = {
            'company_id': company_id,
            'title': company_data['title'],
            'depth': company_data['depth'],
        }
        arcs = company_data['arcs']
        if not arcs:
            company_table.append({**base_row, 'link_type': '', 'link': ''})
            continue
        # Build a fresh dict per arc. Updating and re-appending one shared
        # dict would make every row of a multi-arc node show the last arc.
        for arc in arcs:
            company_table.append({
                **base_row,
                'link_type': arc['arc_type'],
                'link': arc['start_node'],
            })
    return company_table
|
||||
|
||||
@property
|
||||
def address(self, value):
|
||||
"""address property representing seed address."""
|
||||
@@ -108,13 +153,38 @@ class Network:
|
||||
def address(self, new_value):
|
||||
"""address setter."""
|
||||
self._address = new_value
|
||||
self.addresses = [dict({
|
||||
'address': self._address,
|
||||
'n':self.n,
|
||||
'link_type': '',
|
||||
'node_type': '',
|
||||
'node': ''
|
||||
})]
|
||||
self.graph = {
|
||||
new_value: {
|
||||
'title': new_value,
|
||||
'depth':self.n,
|
||||
'node_type': "Address",
|
||||
'arcs': []
|
||||
}
|
||||
}
|
||||
|
||||
@property
def addresses(self):
    """Return one table row per address link found in the graph.

    Scans ``self.graph`` for nodes whose ``node_type`` is ``'Address'`` and
    flattens each into dicts with the keys ``address``, ``title``, ``depth``,
    ``link_type`` and ``link``.

    A node without arcs yields a single row whose ``link_type`` and ``link``
    are empty strings. A node with arcs yields one row per arc, where
    ``link_type`` is the arc's ``arc_type`` and ``link`` is the arc's
    ``start_node``.

    Returns:
        list[dict]: The rows, in graph iteration order.
    """
    address_table = []
    for address_string, address_data in self.graph.items():
        if address_data['node_type'] != 'Address':
            continue
        base_row = {
            'address': address_string,
            'title': address_data['title'],
            'depth': address_data['depth'],
        }
        arcs = address_data['arcs']
        if not arcs:
            address_table.append({**base_row, 'link_type': '', 'link': ''})
            continue
        # Build a fresh dict per arc. Updating and re-appending one shared
        # dict would make every row of a multi-arc node show the last arc.
        for arc in arcs:
            address_table.append({
                **base_row,
                'link_type': arc['arc_type'],
                'link': arc['start_node'],
            })
    return address_table
|
||||
|
||||
@property
|
||||
def file(self):
|
||||
@@ -155,10 +225,8 @@ class Network:
|
||||
if filename:
|
||||
f = open(f'../assets/networks/{filename}')
|
||||
network_data = json.load(f)
|
||||
self.addresses = network_data['addresses']
|
||||
self.officer_ids = network_data['officer_ids']
|
||||
self.company_ids = network_data['company_ids']
|
||||
self.companies = network_data['companies']
|
||||
self.graph = network_data['graph']
|
||||
self.company_records = network_data['company_records']
|
||||
self.address_history = network_data['address_history']
|
||||
self._officer_id = network_data['_officer_id']
|
||||
self._company_id = network_data['_company_id']
|
||||
@@ -174,233 +242,157 @@ class Network:
|
||||
def run_map_preprocessing(self):
|
||||
"""Gets missing/additional information on companies and addresses required for
|
||||
mapping them. This includes address histories, company records and coordinates."""
|
||||
self.get_address_histories()
|
||||
self.get_network_edge_address_histories()
|
||||
self.get_company_records_from_id()
|
||||
self.get_coords()
|
||||
return
|
||||
|
||||
def get_address_histories(self):
|
||||
"""Gets missing address histories for companies at the edge of the network."""
|
||||
historic_address_company_ids = list(dict.fromkeys([company['company_number'] for company in self.address_history]))
|
||||
for i, company in enumerate(self.company_ids):
|
||||
IPython.display.clear_output(wait=True)
|
||||
print("Updated " + str(i+1) + "/" + str(len(self.company_ids)) + " company addresses.")
|
||||
# if company is at the edge of the network:
|
||||
# if historic address not in
|
||||
if company['company_id'] not in historic_address_company_ids:
|
||||
historic_address_company_ids.append(company['company_id'])
|
||||
address_history = sugartrail.processing.build_address_history(company['company_id'])
|
||||
historic_addresses = []
|
||||
for historic_address in address_history:
|
||||
if historic_address not in self.address_history:
|
||||
historic_addresses.append(historic_address)
|
||||
self.address_history.extend(historic_addresses)
|
||||
|
||||
def get_company_records_from_id(self, company_df=None, print_progress=True):
|
||||
"""Gets company records for all company IDs in the network. Additionally
|
||||
enriches company_ids with company names for improved readability."""
|
||||
company_list = [company['company_id'] for company in self.company_ids]
|
||||
companies = []
|
||||
"""Gets company records for all company IDs in the network."""
|
||||
company_list = [item for item in self.graph.keys() if self.graph[item]['node_type'] == 'Company']
|
||||
company_records = []
|
||||
for i, company_id in enumerate(company_list):
|
||||
IPython.display.clear_output(wait=True)
|
||||
if print_progress:
|
||||
print("Processed " + str(i+1) + "/" + str(len(company_list)) + " companies.")
|
||||
if company_id not in [company['company_number'] for company in self.companies]:
|
||||
# if using local Companies House data
|
||||
if company_id not in [company['company_number'] for company in self.company_records]:
|
||||
if company_df is not None:
|
||||
try:
|
||||
company = company_df[company_df[" CompanyNumber"] == str(company_id)]["CompanyName"].item()
|
||||
if company:
|
||||
companies.append(company)
|
||||
company_records.append(company)
|
||||
except:
|
||||
try:
|
||||
company = sugartrail.api.get_company(company_id)
|
||||
if company:
|
||||
companies.append(company)
|
||||
company_records.append(company)
|
||||
except:
|
||||
print(f"Failed to get data for {company_id}")
|
||||
# otherwise uses API
|
||||
else:
|
||||
company = sugartrail.api.get_company(company_id)
|
||||
if company:
|
||||
companies.append(company)
|
||||
# update company_ids with company name
|
||||
self.company_ids[i]['company_name'] = company['company_name']
|
||||
else:
|
||||
self.company_ids[i]['company_name'] = list(filter(lambda d: d.get('company_number') == company_id, self.companies))[0]['company_name']
|
||||
self.companies.extend(companies)
|
||||
company_records.append(company)
|
||||
self.company_records.extend(company_records)
|
||||
|
||||
def get_network_edge_address_histories(self):
    """Gets missing address histories for companies at the edge of the network.

    Does nothing unless ``self.hop.get_company_address_history`` is truthy.
    Otherwise, for every graph node of type ``'Company'`` whose depth equals
    ``self.n``, the address history is fetched through
    ``sugartrail.processing.build_address_history``. Each returned entry that
    has an ``'address'`` key is appended to ``self.address_history`` and
    linked into ``self.graph`` as an ``'Address'`` node one level deeper than
    the company.
    """
    if self.hop.get_company_address_history:
        # Collect the companies sitting at the current outermost depth.
        network_edge_companies = []
        for item in self.graph.keys():
            if self.graph[item]['depth'] == self.n and self.graph[item]['node_type'] == 'Company':
                network_edge_companies.append(item)
        for i, company in enumerate(network_edge_companies):
            # Replace the previous progress line in the notebook output.
            IPython.display.clear_output(wait=True)
            print("Processed " + str(i+1) + "/" + str(len(network_edge_companies)) + " company addresses.")
            # get company address history
            address_history = sugartrail.processing.build_address_history(company)
            if address_history:
                for address in address_history:
                    # Entries without an 'address' key are skipped.
                    if 'address' in address:
                        self.address_history.append(address)
                        new_address = address['address']
                        # Register the address as a new node one hop beyond
                        # the edge if the graph does not know it yet.
                        if new_address not in self.graph:
                            self.graph[new_address] = {
                                'depth': self.n+1,
                                'title': new_address,
                                'node_type': "Address",
                                'arcs': []
                            }
                        arc = {
                            'arc_type': "Historic Address",
                            'start_node': company
                        }
                        # Add the arc only once, and only to address nodes
                        # at depth n+1; nodes at other depths are left as is.
                        if arc not in self.graph[new_address]['arcs'] and self.graph[new_address]['depth'] == self.n+1:
                            self.graph[new_address]['arcs'].append(arc)
|
||||
|
||||
def get_coords(self):
|
||||
"""Gets coordinates for each address in addresses and address_history."""
|
||||
for i, row in enumerate(self.addresses):
|
||||
address_coords = {}
|
||||
for i, address in enumerate(self.address_history):
|
||||
IPython.display.clear_output(wait=True)
|
||||
print("Processed " + str(i+1) + "/" + str(len(self.addresses)) + " addresses.")
|
||||
if 'lat' not in row or 'lon' not in row:
|
||||
coords = sugartrail.processing.get_coords_from_address(row['address'])
|
||||
print("Processed " + str(i+1) + "/" + str(len(self.address_history)) + " addresses.")
|
||||
if address['address'] not in address_coords:
|
||||
coords = sugartrail.processing.get_coords_from_address(address['address'])
|
||||
if coords:
|
||||
self.addresses[i]['lat'] = coords['lat']
|
||||
self.addresses[i]['lon'] = coords['lon']
|
||||
historic_addresses = list(filter(lambda d: d.get('address') == row['address'], self.address_history))
|
||||
for j, historic_address in enumerate(self.address_history):
|
||||
if historic_address['address'] == row['address']:
|
||||
self.address_history[j]['lon'] = coords['lon']
|
||||
self.address_history[j]['lat'] = coords['lat']
|
||||
address_coords[address['address']] = {'lat': coords['lat'], 'lon': coords['lon']}
|
||||
else:
|
||||
# no coords found
|
||||
self.addresses[i]['lat'] = ""
|
||||
self.addresses[i]['lon'] = ""
|
||||
address_coords[address['address']] = {'lat': '', 'lon': ''}
|
||||
self.address_history[i]['lat'] = address_coords[address['address']]['lat']
|
||||
self.address_history[i]['lon'] = address_coords[address['address']]['lon']
|
||||
self.graph[address['address']]['lat'] = address_coords[address['address']]['lat']
|
||||
self.graph[address['address']]['lon'] = address_coords[address['address']]['lon']
|
||||
|
||||
def find_path(self, select_company):
|
||||
def find_path(self, company_id):
|
||||
"""Finds path from 'select_company' to origin company'."""
|
||||
# retrieve rows containing selected company:
|
||||
network_link_type_rows = list(filter(lambda d: d.get('company_id') == select_company, self.company_ids))
|
||||
path = []
|
||||
# iterate through each path from selected company to seed company:
|
||||
for i, row in enumerate(network_link_type_rows):
|
||||
# insert end of path node:
|
||||
path.insert(0, {
|
||||
'hop': row['n'],
|
||||
"type": "Company",
|
||||
"id": select_company,
|
||||
"node": row['company_name'],
|
||||
"node_type": row['link_type'],
|
||||
"link_id": row['node']
|
||||
})
|
||||
# define search terms for locating connected nodes:
|
||||
search_terms = [{
|
||||
'n': row['n']-1,
|
||||
'node_type':row['node_type'],
|
||||
'node':row['node']
|
||||
}]
|
||||
# iterate through degrees of seperation till origin is reached:
|
||||
for j in range(row['n']-1,-1,-1):
|
||||
for term in search_terms:
|
||||
if term['n'] == j:
|
||||
if term['node_type'] == "Address":
|
||||
select_rows = list(filter(lambda d: d.get('address') == term['node'] and d.get('n') == j, self.addresses))
|
||||
for k, select_row in enumerate(select_rows):
|
||||
if select_row['n'] == 0:
|
||||
origin = {
|
||||
'hop': j,
|
||||
"type": "Address",
|
||||
"id": select_row['address'],
|
||||
"node": select_row['address'],
|
||||
"node_type": "",
|
||||
"link_id": ""
|
||||
}
|
||||
if origin not in path:
|
||||
path.insert(0, origin)
|
||||
break
|
||||
else:
|
||||
item = {
|
||||
'hop': j,
|
||||
"type": "Address",
|
||||
"id": select_row['address'],
|
||||
"node": select_row['address'],
|
||||
"node_type": select_row['link_type'],
|
||||
"link_id": select_row['node']
|
||||
}
|
||||
if item not in path:
|
||||
path.insert(0, item)
|
||||
search_terms.append({
|
||||
'n': j-1,
|
||||
'node_type':select_row['node_type'],
|
||||
'node':select_row['node']
|
||||
})
|
||||
elif term['node_type'] == "Company":
|
||||
select_rows = list(filter(lambda d: d.get('company_id') == term['node'] and d.get('n') == j, self.company_ids))
|
||||
for l, select_row in enumerate(select_rows):
|
||||
if select_row['n'] == 0:
|
||||
origin = {
|
||||
'hop': j,
|
||||
"type": "Company",
|
||||
"id": select_row['company_id'],
|
||||
"node": select_row['company_name'],
|
||||
"node_type": "",
|
||||
"link_id": ""
|
||||
}
|
||||
if origin not in path:
|
||||
path.insert(0, origin)
|
||||
break
|
||||
else:
|
||||
item = {
|
||||
'hop': j,
|
||||
"type": "Company",
|
||||
"id": select_row['company_id'],
|
||||
"node": select_row['company_name'],
|
||||
"node_type": select_row['link_type'],
|
||||
"link_id": select_row['node']
|
||||
}
|
||||
if item not in path:
|
||||
path.insert(0, item)
|
||||
search_terms.append({
|
||||
'n': j-1,
|
||||
'node_type':select_row['node_type'],
|
||||
'node':select_row['node']
|
||||
})
|
||||
elif term['node_type'] == "Person":
|
||||
select_rows = list(filter(lambda d: d.get('officer_id') == term['node'] and d.get('n') == j, self.officer_ids))
|
||||
for m, select_row in enumerate(select_rows):
|
||||
if select_row['link_type'] == 0:
|
||||
origin = {
|
||||
'hop': j,
|
||||
"type": "Person",
|
||||
"id": select_row["officer_id"],
|
||||
"node": select_row['name'],
|
||||
"node_type": "",
|
||||
"link_id": ""
|
||||
}
|
||||
if origin not in path:
|
||||
path.insert(0, origin)
|
||||
break
|
||||
else:
|
||||
item = {
|
||||
'hop': j,
|
||||
"type": "Person",
|
||||
"id": select_row["officer_id"],
|
||||
"node": str(select_row['name']),
|
||||
"node_type": str(select_row['link_type']),
|
||||
"link_id": select_row['node']
|
||||
}
|
||||
if item not in path:
|
||||
path.insert(0, item)
|
||||
search_terms.append({
|
||||
'n': j-1,
|
||||
'node_type':select_row['node_type'],
|
||||
'node':select_row['node']
|
||||
})
|
||||
else:
|
||||
print(f"{row['node_type']} is invalid node_type")
|
||||
break
|
||||
sorted_path = sorted(path, key=lambda d: d['hop'])
|
||||
# add letter correspondance for readability
|
||||
for i in range(len(sorted_path)-1,-1,-1):
|
||||
search_term = sorted_path[i]['link_id']
|
||||
link_indices = []
|
||||
for j,item in enumerate(sorted_path):
|
||||
if item['id'] == search_term:
|
||||
link_indices.append(alc[j])
|
||||
sorted_path[i]["link"] = ','.join(link_indices)
|
||||
sorted_path[i]["node_index"] = alc[i]
|
||||
return sorted_path
|
||||
end_node = dict(self.graph[company_id])
|
||||
if not end_node['arcs']:
|
||||
# start_node selected
|
||||
end_node.update({
|
||||
'id': company_id,
|
||||
'link_type': '',
|
||||
'link': ''
|
||||
})
|
||||
path.append(dict((k, end_node[k]) for k in ('title', 'depth', 'node_type', 'id', 'link', 'link_type')))
|
||||
else:
|
||||
# work back from the end node to the start node
|
||||
for arc in end_node['arcs']:
|
||||
connection = dict((k, end_node[k]) for k in ('title', 'depth', 'node_type'))
|
||||
connection.update({
|
||||
'id': company_id,
|
||||
'link_type': arc['arc_type'],
|
||||
'link': arc['start_node']
|
||||
})
|
||||
path.append(connection)
|
||||
for connection in path:
|
||||
id = connection['link']
|
||||
node = dict(self.graph[id])
|
||||
if node['arcs']:
|
||||
for arc in node['arcs']:
|
||||
connection = dict((k, node[k]) for k in ('title', 'depth', 'node_type'))
|
||||
connection.update({
|
||||
'id': id,
|
||||
'link_type': arc['arc_type'],
|
||||
'link': arc['start_node']
|
||||
})
|
||||
if connection not in path:
|
||||
path.append(connection)
|
||||
else:
|
||||
start_node = dict((k, node[k]) for k in ('title', 'depth', 'node_type'))
|
||||
start_node.update({
|
||||
'id': id,
|
||||
'link_type': '',
|
||||
'link': ''
|
||||
})
|
||||
path.append(start_node)
|
||||
break
|
||||
path.reverse()
|
||||
path = sugartrail.processing.condense_path(path)
|
||||
path = sugartrail.processing.asciiify_path(path)
|
||||
return path
|
||||
|
||||
def perform_hop(self, hops, company_data=None):
|
||||
"""Gets companies, officers and addresses within n-degrees of seperation
|
||||
from current nodes, where n is the number of hops."""
|
||||
hop_history = []
|
||||
for hop in range(hops):
|
||||
# select the nodes for which the method will retrieve other nodes
|
||||
# 1-degree of seperation from:
|
||||
selected_addresses = [address['address'] for address in list(filter(lambda d: d.get('n') == self.n, self.addresses))]
|
||||
selected_companies = [company['company_id'] for company in list(filter(lambda d: d.get('n') == self.n, self.company_ids))]
|
||||
selected_officers = [officer['officer_id'] for officer in list(filter(lambda d: d.get('n') == self.n, self.officer_ids))]
|
||||
# retrieve addresses, companies and officers at edge of network
|
||||
selected_addresses, selected_companies, selected_officers = [], [], []
|
||||
for k in self.graph.keys():
|
||||
if self.graph[k]['depth'] == self.n:
|
||||
if self.graph[k]['node_type'] == 'Address':
|
||||
selected_addresses.append(k)
|
||||
elif self.graph[k]['node_type'] == 'Person':
|
||||
selected_officers.append(k)
|
||||
elif self.graph[k]['node_type'] == 'Company':
|
||||
selected_companies.append(k)
|
||||
if not selected_addresses and not selected_companies and not selected_officers:
|
||||
print("Edge of network reached.")
|
||||
break
|
||||
# get new addresses, companies and officers connected to selected
|
||||
else:
|
||||
for i,address in enumerate(selected_addresses):
|
||||
# in-case method was run previously and failed to complete,
|
||||
# check if address was previously processed:
|
||||
if address not in self.processed_addresses:
|
||||
self.hop.search_address(self, address, company_data)
|
||||
self.processed_addresses.append(address)
|
||||
@@ -408,8 +400,6 @@ class Network:
|
||||
print("Hop number: " + str(hop+1))
|
||||
print("Processed " + str(i+1) + "/" + str(len(selected_addresses)) + " addresses.")
|
||||
for j,company in enumerate(selected_companies):
|
||||
# in-case method was run previously and failed to complete,
|
||||
# check if company was previously processed:
|
||||
if company not in self.processed_companies:
|
||||
self.hop.search_company_id(self,company)
|
||||
self.processed_companies.append(company)
|
||||
@@ -418,8 +408,6 @@ class Network:
|
||||
print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.")
|
||||
print("Processed " + str(j+1) + "/" + str(len(selected_companies)) + " companies.")
|
||||
for k,officer in enumerate(selected_officers):
|
||||
# in-case method was run previously and failed to complete,
|
||||
# check if officer was previously processed:
|
||||
if officer not in self.processed_officers:
|
||||
self.hop.search_officer_id(self,officer)
|
||||
self.processed_officers.append(officer)
|
||||
@@ -428,260 +416,8 @@ class Network:
|
||||
print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.")
|
||||
print("Processed " + str(len(selected_companies)) + "/" + str(len(selected_companies)) + " companies.")
|
||||
print("Processed " + str(k+1) + "/" + str(len(selected_officers)) + " officers.")
|
||||
self.officer_ids = [i for n, i in enumerate(self.officer_ids) if i not in self.officer_ids[n + 1:]]
|
||||
self.company_ids = [i for n, i in enumerate(self.company_ids) if i not in self.company_ids[n + 1:]]
|
||||
self.maxsize_entities = [i for n, i in enumerate(self.maxsize_entities) if i not in self.maxsize_entities[n + 1:]]
|
||||
self.addresses = [i for n, i in enumerate(self.addresses) if i not in self.addresses[n + 1:]]
|
||||
self.address_history = [i for n, i in enumerate(self.address_history) if i not in self.address_history[n + 1:]]
|
||||
self.companies = [i for n, i in enumerate(self.companies) if i not in self.companies[n + 1:]]
|
||||
self.processed_officers = []
|
||||
self.processed_companies = []
|
||||
self.processed_addresses = []
|
||||
self.processed_officers, self.processed_companies, self.processed_addresses = [],[],[]
|
||||
self.n += 1
|
||||
hop_history.append(self.hop.__dict__)
|
||||
self.hop_history.extend(hop_history)
|
||||
|
||||
class Hop:
    """Search criteria for one hop, plus the searches that apply them.

    Each ``get_*`` attribute switches one kind of lookup on or off and each
    ``*_maxsize`` attribute caps how many results a single node may
    contribute (``None`` means no cap). The ``search_*`` methods append the
    entities they discover to the lists held on the ``network`` object.
    """

    def __init__(self):
        # Lookups performed when the hop starts from a company.
        self.get_company_officers = True
        self.get_company_address_history = True
        self.get_psc_correspondance_address = True
        # Lookups performed when the hop starts from an officer.
        self.get_officer_appointments = True
        self.officer_appointments_maxsize = 50
        self.get_officer_correspondance_address = True
        self.get_officer_duplicates = True
        self.officer_duplicates_maxsize = None
        # Lookups performed when the hop starts from an address.
        self.get_officers_at_address = True
        self.officers_at_address_maxsize = 50
        self.get_companies_at_address = True
        self.companies_at_address_maxsize = 50

    @staticmethod
    def _known(rows, key, network):
        """Return the ``key`` values of rows found on earlier hops."""
        return [row[key] for row in rows if row.get('n') < network.n + 1]

    @staticmethod
    def _record(network, **fields):
        """Build a result row: ``fields`` followed by the hop number and the
        provenance (link type, source node) currently set on ``network``."""
        return {
            **fields,
            'n': network.n + 1,
            'link_type': network.link_type,
            'node_type': network.node_type,
            'node': network.node,
        }

    @staticmethod
    def _within_maxsize(network, results, maxsize, node, node_type, maxsize_type):
        """Return True when ``results`` is small enough to be expanded.

        Otherwise the node is logged in ``network.maxsize_entities`` and
        False is returned. A result set whose size equals ``maxsize`` is
        logged as well, so it is never dropped without a trace.
        """
        if maxsize is None or len(results) < int(maxsize or 0):
            return True
        network.maxsize_entities.append({
            'node': node,
            'type': node_type,
            'maxsize_type': maxsize_type,
            'size': len(results),
        })
        return False

    def search_company_id(self, network, company_id):
        """Gets officers and addresses connected to input company
        (company_id).

        New officers are appended to ``network.officer_ids`` and new
        addresses to ``network.addresses``; the company's address history is
        appended to ``network.address_history``.
        """
        officers = []
        new_addresses = []
        new_officers = []
        if self.get_company_officers:
            # get officers at company
            officers = sugartrail.api.get_company_officers(company_id)
            if officers and 'items' in officers:
                officers = officers['items']
        network.node_type = "Company"
        network.node = company_id
        # addresses and officers already added to the network on earlier hops
        known_addresses = self._known(network.addresses, 'address', network)
        known_officers = self._known(network.officer_ids, 'officer_id', network)
        for officer in officers or []:
            officer_id = str(officer['links']['officer']['appointments'].split('/')[2])
            if officer_id not in known_officers:
                network.link_type = "Officer"
                new_officer = self._record(
                    network,
                    officer_id=officer_id,
                    name=sugartrail.processing.normalise_name(officer['name']),
                )
                if new_officer not in new_officers:
                    new_officers.append(new_officer)
        if self.get_psc_correspondance_address:
            # get address for company pscs
            psc = sugartrail.api.get_psc(company_id)
            if psc and 'items' in psc:
                for person in psc['items']:
                    if "address" in person:
                        network.link_type = "Person of Significant Control Address"
                        psc_address = sugartrail.processing.normalise_address(person['address'])
                        if psc_address not in known_addresses:
                            new_address = self._record(network, address=psc_address)
                            if new_address not in new_addresses:
                                new_addresses.append(new_address)
        if self.get_company_address_history:
            # get company address history
            # NOTE(review): `or []` guards against the builder returning
            # nothing; its full body is not visible here - confirm.
            address_history = sugartrail.processing.build_address_history(company_id) or []
            network.address_history.extend(address_history)
            for address in address_history:
                network.link_type = "Historic Address"
                if 'address' in address and address['address'] not in known_addresses:
                    new_address = self._record(network, address=address['address'])
                    if new_address not in new_addresses:
                        new_addresses.append(new_address)
        network.addresses.extend(new_addresses)
        network.officer_ids.extend(new_officers)

    def search_officer_id(self, network, officer_id):
        """Gets officers, companies and addresses connected to input officer
        (officer_id).

        New entities are appended to ``network.addresses``,
        ``network.officer_ids`` and ``network.company_ids``.
        """
        new_addresses = []
        new_companies = []
        new_officers = []
        network.node_type = "Person"
        network.node = officer_id
        known_addresses = self._known(network.addresses, 'address', network)
        known_officers = self._known(network.officer_ids, 'officer_id', network)
        known_companies = self._known(network.company_ids, 'company_id', network)
        if self.get_officer_appointments:
            appointments = sugartrail.api.get_appointments(officer_id)
            items = appointments['items'] if appointments and 'items' in appointments else []
            if items and self._within_maxsize(
                    network, items, self.officer_appointments_maxsize,
                    officer_id, 'Officer', 'Appointments'):
                for appointment in items:
                    appointment_address = sugartrail.processing.normalise_address(appointment['address'])
                    if appointment_address not in known_addresses:
                        network.link_type = "Appointment Address"
                        new_address = self._record(network, address=appointment_address)
                        if new_address not in new_addresses:
                            new_addresses.append(new_address)
                    company_number = appointment['appointed_to']['company_number']
                    if company_number not in known_companies:
                        network.link_type = "Appointment"
                        new_company = self._record(network, company_id=company_number)
                        if new_company not in new_companies:
                            new_companies.append(new_company)
        if self.get_officer_correspondance_address:
            correspondance_address = sugartrail.api.get_correspondance_address(officer_id)
            if correspondance_address and 'items' in correspondance_address:
                items = correspondance_address['items']
            else:
                items = []
            # Only the first result is used; skip quietly when there is none.
            if items and 'address' in items[0]:
                officer_address = sugartrail.processing.normalise_address(items[0]['address'])
                if officer_address not in known_addresses:
                    # Spelling kept as-is: the string is stored in result rows.
                    network.link_type = "Officer Corresponance Address"
                    new_address = self._record(network, address=officer_address)
                    if new_address not in new_addresses:
                        new_addresses.append(new_address)
        if self.get_officer_duplicates:
            duplicate_officers = sugartrail.api.get_duplicate_officers(officer_id)
            if duplicate_officers and self._within_maxsize(
                    network, duplicate_officers, self.officer_duplicates_maxsize,
                    officer_id, 'Officer', 'Duplicates'):
                for duplicate in duplicate_officers:
                    network.link_type = "Duplicate Officer"
                    duplicate_id = duplicate['links']['self'].split('/')[2]
                    if duplicate_id not in known_officers:
                        new_officer = self._record(
                            network, officer_id=duplicate_id, name=duplicate['title'])
                        if new_officer not in new_officers:
                            new_officers.append(new_officer)
        network.addresses.extend(new_addresses)
        network.officer_ids.extend(new_officers)
        network.company_ids.extend(new_companies)

    def search_address(self, network, address, company_data):
        """Gets officers and companies connected to input address (address).

        When ``company_data`` is not None the companies are looked up in it
        instead of through the API. New entities are appended to
        ``network.officer_ids`` and ``network.company_ids``.
        """
        new_companies = []
        new_officers = []
        network.node_type = "Address"
        network.node = address
        known_officers = self._known(network.officer_ids, 'officer_id', network)
        known_companies = self._known(network.company_ids, 'company_id', network)
        if self.get_companies_at_address:
            if company_data is not None:
                companies = {
                    'items': sugartrail.processing.get_companies_from_address_database(address, company_data)
                }
            else:
                companies = sugartrail.api.get_companies_at_address(address)
            items = companies['items'] if companies and 'items' in companies else []
            if items and self._within_maxsize(
                    network, items, self.companies_at_address_maxsize,
                    address, 'Address', 'Companies'):
                for company in items:
                    network.link_type = "Company at Address"
                    if company['company_number'] not in known_companies:
                        new_company = self._record(network, company_id=company['company_number'])
                        if new_company not in new_companies:
                            new_companies.append(new_company)
        if self.get_officers_at_address:
            officers = sugartrail.api.get_officers_at_address(address)
            if officers and self._within_maxsize(
                    network, officers, self.officers_at_address_maxsize,
                    address, 'Address', 'Officers'):
                for officer in officers:
                    # Both keys are required below; the previous
                    # `'links' and 'title' in officer` only tested 'title'.
                    if 'links' in officer and 'title' in officer:
                        network.link_type = "Officer at Address"
                        found_id = officer['links']['self'].split('/')[2]
                        if found_id not in known_officers:
                            new_officer = self._record(
                                network, officer_id=found_id, name=officer['title'])
                            if new_officer not in new_officers:
                                new_officers.append(new_officer)
        network.officer_ids.extend(new_officers)
        network.company_ids.extend(new_companies)
|
||||
|
||||
221
sugartrail/hop.py
Normal file
221
sugartrail/hop.py
Normal file
@@ -0,0 +1,221 @@
|
||||
import sugartrail
|
||||
|
||||
class Hop:
    """Search criteria for one hop, plus the searches that apply them.

    Each ``get_*`` attribute switches one kind of lookup on or off and each
    ``*_maxsize`` attribute caps how many results a single node may
    contribute (``None`` means no cap). The ``search_*`` methods add the
    nodes they discover to ``network.graph``, a dict keyed by node id whose
    values hold ``depth``, ``title``, ``node_type`` and a list of ``arcs``.
    """

    def __init__(self):
        # Lookups performed when the hop starts from a company.
        self.get_company_officers = True
        self.get_company_address_history = True
        self.get_psc_correspondance_address = True
        # Lookups performed when the hop starts from an officer.
        self.get_officer_appointments = True
        self.officer_appointments_maxsize = 50
        self.get_officer_correspondance_address = True
        self.get_officer_duplicates = True
        self.officer_duplicates_maxsize = None
        # Lookups performed when the hop starts from an address.
        self.get_officers_at_address = True
        self.officers_at_address_maxsize = 50
        self.get_companies_at_address = True
        self.companies_at_address_maxsize = 50

    @staticmethod
    def _link_node(network, node_id, title, node_type, arc_type, start_node):
        """Make sure ``node_id`` is in ``network.graph`` and record the arc
        from ``start_node`` that reached it."""
        depth = network.n + 1
        node = network.graph.setdefault(node_id, {
            'depth': depth,
            'title': title,
            'node_type': node_type,
            'arcs': []
        })
        arc = {
            'arc_type': arc_type,
            'start_node': start_node
        }
        # Nodes first seen on an earlier hop keep the arcs they already
        # have; only nodes at the current depth collect new ones.
        if node['depth'] == depth and arc not in node['arcs']:
            node['arcs'].append(arc)

    @staticmethod
    def _within_maxsize(network, results, maxsize, node, node_type, maxsize_type):
        """Return True when ``results`` is small enough to be expanded.

        Otherwise the node is logged in ``network.maxsize_entities`` and
        False is returned. A result set whose size equals ``maxsize`` is
        logged as well, so it is never dropped without a trace.
        """
        if maxsize is None or len(results) < int(maxsize or 0):
            return True
        network.maxsize_entities.append({
            'node': node,
            'type': node_type,
            'maxsize_type': maxsize_type,
            'size': len(results)
        })
        return False

    def search_company_id(self, network, company_id):
        """Gets officers and addresses connected to input company
        (company_id).

        Officers and addresses are added to ``network.graph``; address
        history entries that carry an address are also appended to
        ``network.address_history``.
        """
        officers = []
        if self.get_company_officers:
            officers = sugartrail.api.get_company_officers(company_id)
            if officers and 'items' in officers:
                officers = officers['items']
        for officer in officers or []:
            self._link_node(
                network,
                str(officer['links']['officer']['appointments'].split('/')[2]),
                sugartrail.processing.normalise_name(officer['name']),
                "Person", "Officer", company_id)
        if self.get_psc_correspondance_address:
            # get address for company pscs
            psc = sugartrail.api.get_psc(company_id)
            if psc and 'items' in psc:
                for person in psc['items']:
                    if "address" in person:
                        new_address = sugartrail.processing.normalise_address(person['address'])
                        self._link_node(
                            network, new_address, new_address, "Address",
                            "Person of Significant Control Address", company_id)
        if self.get_company_address_history:
            # get company address history
            # NOTE(review): `or []` guards against the builder returning
            # nothing; its full body is not visible here - confirm.
            address_history = sugartrail.processing.build_address_history(company_id) or []
            for address in address_history:
                if 'address' in address:
                    network.address_history.append(address)
                    new_address = address['address']
                    self._link_node(
                        network, new_address, new_address, "Address",
                        "Historic Address", company_id)

    def search_officer_id(self, network, officer_id):
        """Gets officers, companies and addresses connected to input officer
        (officer_id) and adds them to ``network.graph``."""
        if self.get_officer_appointments:
            appointments = sugartrail.api.get_appointments(officer_id)
            items = appointments['items'] if appointments and 'items' in appointments else []
            if items and self._within_maxsize(
                    network, items, self.officer_appointments_maxsize,
                    officer_id, 'Officer', 'Appointments'):
                for appointment in items:
                    appointed_to = appointment['appointed_to']
                    self._link_node(
                        network, appointed_to['company_number'],
                        appointed_to['company_name'],
                        "Company", "Appointment", officer_id)
        if self.get_officer_correspondance_address:
            correspondance_address = sugartrail.api.get_correspondance_address(officer_id)
            if correspondance_address and 'items' in correspondance_address:
                items = correspondance_address['items']
            else:
                items = []
            # Only the first result is used; skip quietly when there is none.
            if items and 'address' in items[0]:
                new_address = sugartrail.processing.normalise_address(items[0]['address'])
                # Spelling kept as-is: the string is stored in the graph.
                self._link_node(
                    network, new_address, new_address, "Address",
                    "Officer Corresponance Address", officer_id)
        if self.get_officer_duplicates:
            duplicate_officers = sugartrail.api.get_duplicate_officers(officer_id)
            if duplicate_officers and self._within_maxsize(
                    network, duplicate_officers, self.officer_duplicates_maxsize,
                    officer_id, 'Officer', 'Duplicates'):
                for duplicate in duplicate_officers:
                    self._link_node(
                        network, duplicate['links']['self'].split('/')[2],
                        duplicate['title'],
                        "Person", "Duplicate Officer", officer_id)

    def search_address(self, network, address, company_data):
        """Gets officers and companies connected to input address (address)
        and adds them to ``network.graph``.

        When ``company_data`` is not None the companies are looked up in it
        instead of through the API.
        """
        if self.get_companies_at_address:
            if company_data is not None:
                companies = {
                    'items': sugartrail.processing.get_companies_from_address_database(address, company_data)
                }
            else:
                companies = sugartrail.api.get_companies_at_address(address)
            items = companies['items'] if companies and 'items' in companies else []
            if items and self._within_maxsize(
                    network, items, self.companies_at_address_maxsize,
                    address, 'Address', 'Companies'):
                for company in items:
                    self._link_node(
                        network, company['company_number'],
                        company['company_name'],
                        "Company", "Company at Address", address)
        if self.get_officers_at_address:
            officers = sugartrail.api.get_officers_at_address(address)
            if officers and self._within_maxsize(
                    network, officers, self.officers_at_address_maxsize,
                    address, 'Address', 'Officers'):
                for officer in officers:
                    # Both keys are required below; the previous
                    # `'links' and 'title' in officer` only tested 'title'.
                    if 'links' in officer and 'title' in officer:
                        self._link_node(
                            network, officer['links']['self'].split('/')[2],
                            officer['title'],
                            "Person", "Officer at Address", address)
|
||||
@@ -64,52 +64,53 @@ def get_marker_data(network,address_trail, origin_trail, path_table):
|
||||
if row['lat'] and row['lon']:
|
||||
marker_color = "green"
|
||||
# locate company at historic address
|
||||
company = list(filter(lambda d: d.get('company_number') == row['company_number'], network.companies))[0]
|
||||
company_name = company['company_name']
|
||||
company_status = company['company_status']
|
||||
if company_status == "active":
|
||||
if row['end_date']:
|
||||
marker_color = "red"
|
||||
else:
|
||||
marker_color = "black"
|
||||
address = row['address']
|
||||
# find path from company to origin
|
||||
path = network.find_path(str(row['company_number']))
|
||||
locations_from_origin = locations_from_origin_path(path, network)
|
||||
message = HTML()
|
||||
message.value = str(company_name) + "<hr>" + str(address)
|
||||
icon = AwesomeIcon(
|
||||
marker_color=marker_color
|
||||
)
|
||||
# find historic addresses path for company
|
||||
address_path = get_address_path(network,str(row['company_number']))
|
||||
marker = Marker(
|
||||
icon=icon,
|
||||
opacity=1,
|
||||
location=(row['lat'],
|
||||
row['lon']),
|
||||
draggable=False,
|
||||
popup=message,
|
||||
title="Address"
|
||||
company = list(filter(lambda d: d.get('company_number') == row['company_number'], network.company_records))
|
||||
if company:
|
||||
company_name = company[0]['company_name']
|
||||
company_status = company[0]['company_status']
|
||||
if company_status == "active":
|
||||
if row['end_date']:
|
||||
marker_color = "red"
|
||||
else:
|
||||
marker_color = "black"
|
||||
address = row['address']
|
||||
# find path from company to origin
|
||||
path = network.find_path(str(row['company_number']))
|
||||
locations_from_origin = locations_from_origin_path(path, network)
|
||||
message = HTML()
|
||||
message.value = str(company_name) + "<hr>" + str(address)
|
||||
icon = AwesomeIcon(
|
||||
marker_color=marker_color
|
||||
)
|
||||
# attach on click behavoir for marker
|
||||
marker.on_click(functools.partial(
|
||||
on_button_clicked,
|
||||
address_path=address_path,
|
||||
address_trail=address_trail,
|
||||
path_table=path_table,
|
||||
origin_trail=origin_trail,
|
||||
path=path, location=(row['lat'], row['lon']),
|
||||
locations_from_origin = locations_from_origin
|
||||
))
|
||||
markers.append(marker)
|
||||
# find historic addresses path for company
|
||||
address_path = get_address_path(network,str(row['company_number']))
|
||||
marker = Marker(
|
||||
icon=icon,
|
||||
opacity=1,
|
||||
location=(row['lat'],
|
||||
row['lon']),
|
||||
draggable=False,
|
||||
popup=message,
|
||||
title="Address"
|
||||
)
|
||||
# attach on click behavoir for marker
|
||||
marker.on_click(functools.partial(
|
||||
on_button_clicked,
|
||||
address_path=address_path,
|
||||
address_trail=address_trail,
|
||||
path_table=path_table,
|
||||
origin_trail=origin_trail,
|
||||
path=path, location=(row['lat'], row['lon']),
|
||||
locations_from_origin = locations_from_origin
|
||||
))
|
||||
markers.append(marker)
|
||||
return markers
|
||||
|
||||
def locations_from_origin_path(path, network):
|
||||
"""Returns list of addresses found within origin path."""
|
||||
locations = []
|
||||
for node in path:
|
||||
if node['type'] == 'Company':
|
||||
if node['node_type'] == 'Company':
|
||||
# finds location for company node
|
||||
company_address_history = list(filter(lambda d: d.get('company_number') == node['id'], network.address_history))
|
||||
company_address_history_sorted = sorted(company_address_history, key=lambda d: d['start_date'], reverse=True)
|
||||
@@ -125,14 +126,13 @@ def locations_from_origin_path(path, network):
|
||||
pass
|
||||
else:
|
||||
locations.append([lat,lon])
|
||||
elif node['type'] == 'Address':
|
||||
address_row = list(filter(lambda d: d.get('address') == node['node'], network.addresses))[0]
|
||||
lat = address_row['lat']
|
||||
lon = address_row['lon']
|
||||
if not lat or not lon:
|
||||
pass
|
||||
else:
|
||||
elif node['node_type'] == 'Address':
|
||||
if 'lat' in network.graph[node['id']]:
|
||||
lat = network.graph[node['id']]['lat']
|
||||
lon = network.graph[node['id']]['lon']
|
||||
locations.append([lat,lon])
|
||||
else:
|
||||
pass
|
||||
return locations
|
||||
|
||||
def get_address_path(network, company_id):
|
||||
@@ -158,12 +158,12 @@ def on_button_clicked(address_path, path, location, address_trail, path_table, o
|
||||
def html_table_generator(path):
    """Generates table for displaying origin path data.

    ``path`` is a list of dicts, each providing the keys ``node_index``,
    ``title``, ``depth``, ``link_type`` and ``link``. Returns one HTML
    string: an inline ``<style>`` block followed by a table with a header
    row and one row per path entry.
    """
    # Local import so the block does not depend on a file-level import.
    from html import escape

    table_style = '<style>table {font-family: arial, sans-serif;border-collapse: collapse;}td, th {border: 1px solid #dddddd;text-align: left;padding: 8px;}tr:nth-child(even) {background-color: #dddddd;}</style>'
    headers = ['Node Index', 'Title', 'Depth', 'Link Type', 'Link']
    # Path keys, in the same order as the headers above.
    columns = ('node_index', 'title', 'depth', 'link_type', 'link')
    headers_row = "".join('<th>' + header + '</th>' for header in headers)
    # Cell text is escaped so that '&', '<' or '>' in a value cannot break
    # the markup.
    nodes = "".join(
        '<tr>'
        + "".join('<td>' + escape(str(node[column]), quote=False) + '</td>' for column in columns)
        + '</tr>'
        for node in path
    )
    table_html = table_style + '<table><tr>' + headers_row + '</tr>' + nodes + '</table>'
    return table_html
|
||||
|
||||
@@ -5,6 +5,7 @@ import random
|
||||
import urllib
|
||||
import regex as re
|
||||
import collections
|
||||
from string import ascii_letters as alc
|
||||
|
||||
def flatten(d, parent_key='', sep='.'):
|
||||
"""Flatten nested dictionary."""
|
||||
@@ -25,6 +26,24 @@ def infer_postcode(address_string):
|
||||
else:
|
||||
return
|
||||
|
||||
def condense_path(path):
    """Merge path entries that differ only in their ``link`` value.

    Entries are grouped by ``title``, ``depth``, ``node_type``, ``id`` and
    ``link_type`` (whose values must be hashable). Each group becomes one
    dict holding those five keys plus ``link``, a list of the group's
    ``link`` values in path order. Groups are returned in order of first
    appearance; ``path`` itself is not modified.
    """
    identity_keys = ('title', 'depth', 'node_type', 'id', 'link_type')
    merged = {}
    # Single pass: group on the identifying values instead of rescanning
    # the whole path for every entry.
    for item in path:
        identity = tuple(item[key] for key in identity_keys)
        if identity not in merged:
            merged[identity] = dict(zip(identity_keys, identity), link=[])
        merged[identity]['link'].append(item['link'])
    return list(merged.values())
|
||||
|
||||
def asciiify_path(path):
    """Label each path entry with letters and rewrite its links as labels.

    Every entry gets a ``node_index``: one letter for the first 51 entries,
    the same sequence doubled for the next 51, and so on. Each entry's
    ``link`` is then replaced by a comma-separated string of the labels of
    the entries whose ``id`` is contained in it. ``path`` is modified in
    place and returned.
    """
    # First pass: label every entry. Doing this before resolving links
    # means a link may point at an entry that comes later in the path.
    for i, item in enumerate(path):
        # The cycle length of 51 is kept so existing labels do not change;
        # it means the 52nd letter ('Z') is never used.
        item['node_index'] = (i // 51 + 1) * alc[i % 51]
    # Second pass: translate the linked ids into labels.
    for item in path:
        item['link'] = ", ".join(d['node_index'] for d in path if d['id'] in item['link'])
    return path
|
||||
|
||||
def get_companies_from_address_database(address, company_data):
|
||||
"""Searches input dataframe (company_data) for companies at input address
|
||||
(address) and returns list of dicts."""
|
||||
@@ -100,50 +119,51 @@ def build_address_history(company_id):
|
||||
address_changes = api.get_address_changes(company_id)
|
||||
address_keys = ('start_date','end_date','address')
|
||||
if address_changes:
|
||||
if address_changes['items']:
|
||||
# attempt to retrieve any missing items within address changes
|
||||
address_changes = process_address_changes(address_changes)
|
||||
addresses = []
|
||||
entry = {}
|
||||
entry["company_number"] = str(company_id)
|
||||
entry["lat"] = ""
|
||||
entry["lon"] = ""
|
||||
entry["address"] = str(normalise_address(company_info_subset['registered_office_address']))
|
||||
entry["start_date"] = str(address_changes['items'][0]['date'])
|
||||
if 'date_of_cessation' in company_info_subset:
|
||||
entry["end_date"] = str(company_info_subset['date_of_cessation'])
|
||||
else:
|
||||
entry["end_date"] = None
|
||||
addresses.append(entry)
|
||||
for i,change in enumerate(address_changes['items']):
|
||||
if 'items' in address_changes:
|
||||
if address_changes['items']:
|
||||
# attempt to retrieve any missing items within address changes
|
||||
address_changes = process_address_changes(address_changes)
|
||||
addresses = []
|
||||
entry = {}
|
||||
entry["company_number"] = str(company_id)
|
||||
entry["lat"] = ""
|
||||
entry["lon"] = ""
|
||||
entry["company_number"] = str(company_id)
|
||||
if 'old_address' in change['description_values']:
|
||||
entry["address"] = change['description_values']['old_address']
|
||||
entry["address"] = str(normalise_address(company_info_subset['registered_office_address']))
|
||||
entry["start_date"] = str(address_changes['items'][0]['date'])
|
||||
if 'date_of_cessation' in company_info_subset:
|
||||
entry["end_date"] = str(company_info_subset['date_of_cessation'])
|
||||
else:
|
||||
entry["address"] = ""
|
||||
if i+1 < len(address_changes['items']):
|
||||
entry["start_date"] = str(address_changes['items'][i+1]['date'])
|
||||
else:
|
||||
entry["start_date"] = company_info_subset['date_of_creation']
|
||||
entry["end_date"] = str(change['date'])
|
||||
entry["end_date"] = None
|
||||
addresses.append(entry)
|
||||
return addresses
|
||||
else:
|
||||
address_history = []
|
||||
entry = {}
|
||||
for k, key in enumerate(["date_of_creation","date_of_cessation","registered_office_address"]):
|
||||
if key in company_info:
|
||||
entry[address_keys[k]] = company_info[key]
|
||||
else:
|
||||
entry[address_keys[k]] = None
|
||||
entry["company_number"] = str(company_id)
|
||||
entry['address'] = normalise_address(entry['address'])
|
||||
entry["lat"] = ""
|
||||
entry["lon"] = ""
|
||||
return [entry]
|
||||
for i,change in enumerate(address_changes['items']):
|
||||
entry = {}
|
||||
entry["lat"] = ""
|
||||
entry["lon"] = ""
|
||||
entry["company_number"] = str(company_id)
|
||||
if 'old_address' in change['description_values']:
|
||||
entry["address"] = change['description_values']['old_address']
|
||||
else:
|
||||
entry["address"] = ""
|
||||
if i+1 < len(address_changes['items']):
|
||||
entry["start_date"] = str(address_changes['items'][i+1]['date'])
|
||||
else:
|
||||
entry["start_date"] = company_info_subset['date_of_creation']
|
||||
entry["end_date"] = str(change['date'])
|
||||
addresses.append(entry)
|
||||
return addresses
|
||||
else:
|
||||
address_history = []
|
||||
entry = {}
|
||||
for k, key in enumerate(["date_of_creation","date_of_cessation","registered_office_address"]):
|
||||
if key in company_info:
|
||||
entry[address_keys[k]] = company_info[key]
|
||||
else:
|
||||
entry[address_keys[k]] = None
|
||||
entry["company_number"] = str(company_id)
|
||||
entry['address'] = normalise_address(entry['address'])
|
||||
entry["lat"] = ""
|
||||
entry["lon"] = ""
|
||||
return [entry]
|
||||
else:
|
||||
address_history = []
|
||||
entry = {}
|
||||
|
||||
Reference in New Issue
Block a user