From 7c9cf0e775767127dfa25df63ef948f5400a6e1e Mon Sep 17 00:00:00 2001 From: seangreaves Date: Tue, 31 Jan 2023 13:37:20 +0000 Subject: [PATCH] added tutorial connecting two companies --- .gitignore | 1 + notebooks/004_connection_check.ipynb | 363 +++++++++++++++++++++++++++ sugartrail/base.py | 33 +-- sugartrail/hop.py | 33 +-- sugartrail/processing.py | 15 ++ 5 files changed, 414 insertions(+), 31 deletions(-) create mode 100644 notebooks/004_connection_check.ipynb diff --git a/.gitignore b/.gitignore index 48809b0..2dd2637 100644 --- a/.gitignore +++ b/.gitignore @@ -51,5 +51,6 @@ docs/_build/ # Testing notebook notebooks/testing.ipynb +notebooks/investigations *.DS_Store diff --git a/notebooks/004_connection_check.ipynb b/notebooks/004_connection_check.ipynb new file mode 100644 index 0000000..8192684 --- /dev/null +++ b/notebooks/004_connection_check.ipynb @@ -0,0 +1,363 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b7641405", + "metadata": {}, + "source": [ + "*In this tutorial we will investigate two separate companies and check if they are connected.*" + ] + }, + { + "cell_type": "markdown", + "id": "e39bd44d", + "metadata": {}, + "source": [ + "There are instances where we may want to see if two companies are connected. 
We can do this by simply building a network for each company and comparing them to see if there are any common officers, addresses or companies.\n", + "\n", + "Let's test this approach with two example companies, Zahawi & Zahawi Ltd (07285998) and Gorgeous Services Limited (05714521):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "53435932", + "metadata": {}, + "outputs": [], + "source": [ + "import sugartrail\n", + "import pandas as pd\n", + "sugartrail.api.basic_auth.username = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "489a4141", + "metadata": {}, + "source": [ + "Create one network for Zahawi & Zahawi, including some limits to reduce the number of possibly irrelevant connections:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "300cecde", + "metadata": {}, + "outputs": [], + "source": [ + "zahawi_connections = sugartrail.base.Network(company_id='07285998')\n", + "zahawi_connections.hop.officer_appointments_maxsize = 20\n", + "zahawi_connections.hop.officers_at_address_maxsize = 20\n", + "zahawi_connections.hop.companies_at_address_maxsize = 20" + ] + }, + { + "cell_type": "markdown", + "id": "bf8ddb84", + "metadata": {}, + "source": [ + "Create a second network for Gorgeous Services:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9480e020", + "metadata": {}, + "outputs": [], + "source": [ + "gorgeous_connections = sugartrail.base.Network(company_id='05714521')\n", + "gorgeous_connections.hop.officer_appointments_maxsize = 20\n", + "gorgeous_connections.hop.officers_at_address_maxsize = 20\n", + "gorgeous_connections.hop.companies_at_address_maxsize = 20" + ] + }, + { + "cell_type": "markdown", + "id": "fd678b28", + "metadata": {}, + "source": [ + "We can now pass both networks to the `find_network_connections` method which returns any connections found between two networks. 
The method accepts two networks as input and an optional `max_depth` value (defaults to 5) which sets the maximum depth of network we will build for both. `find_network_connections` builds each network up to the `max_depth` value and completes when connections are found or the `max_depth` is reached." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b4036e3d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1/5 hops completed.\n", + "2/5 hops completed.\n", + "3/5 hops completed.\n", + "Found connection(s)!\n" + ] + } + ], + "source": [ + "connections = sugartrail.processing.find_network_connections(zahawi_connections, gorgeous_connections)" + ] + }, + { + "cell_type": "markdown", + "id": "bac64a8e", + "metadata": {}, + "source": [ + "Looks like a connection was found. We can see by the long string of characters that it's an officer ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "be034584", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['g8BmvnpH8blqT87i93sgJeowx7I']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "connections" + ] + }, + { + "cell_type": "markdown", + "id": "6cd89faa", + "metadata": {}, + "source": [ + "We can now trace the path from Zahawi & Zahawi to this connection:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9544095a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titledepthnode_typeidlink_typelinknode_index
0ZAHAWI & ZAHAWI LTD0Company07285998a
1Nadhim ZAHAWI1PersontKup8kXPh3-jx_5Bs-BkF5XCyPMOfficerab
2YOUGOV PLC2Company03607311Appointmentbc
3Benjamin William ELLIOT3Persong8BmvnpH8blqT87i93sgJeowx7IOfficercd
\n", + "
" + ], + "text/plain": [ + " title depth node_type id \\\n", + "0 ZAHAWI & ZAHAWI LTD 0 Company 07285998 \n", + "1 Nadhim ZAHAWI 1 Person tKup8kXPh3-jx_5Bs-BkF5XCyPM \n", + "2 YOUGOV PLC 2 Company 03607311 \n", + "3 Benjamin William ELLIOT 3 Person g8BmvnpH8blqT87i93sgJeowx7I \n", + "\n", + " link_type link node_index \n", + "0 a \n", + "1 Officer a b \n", + "2 Appointment b c \n", + "3 Officer c d " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(zahawi_connections.find_path('g8BmvnpH8blqT87i93sgJeowx7I'))" + ] + }, + { + "cell_type": "markdown", + "id": "613910a7", + "metadata": {}, + "source": [ + "... and the path from Gorgeous Services to the connection:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f810b714", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titledepthnode_typeidlink_typelinknode_index
0GORGEOUS SERVICES LIMITED0Company05714521a
1Benjamin William ELLIOT1Persong8BmvnpH8blqT87i93sgJeowx7IOfficerab
\n", + "
" + ], + "text/plain": [ + " title depth node_type id \\\n", + "0 GORGEOUS SERVICES LIMITED 0 Company 05714521 \n", + "1 Benjamin William ELLIOT 1 Person g8BmvnpH8blqT87i93sgJeowx7I \n", + "\n", + " link_type link node_index \n", + "0 a \n", + "1 Officer a b " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(gorgeous_connections.find_path('g8BmvnpH8blqT87i93sgJeowx7I'))" + ] + }, + { + "cell_type": "markdown", + "id": "3e6ffa85", + "metadata": {}, + "source": [ + "Reading both paths tells us how Zahawi & Zahawi connects to Gorgeous Services." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sugartrail/base.py b/sugartrail/base.py index acf2591..8b30399 100644 --- a/sugartrail/base.py +++ b/sugartrail/base.py @@ -221,9 +221,9 @@ class Network: f.close def load(self, filename): - """Loads network stored in JSON format from '../assets/networks/'.""" + """Loads network stored in JSON format.""" if filename: - f = open(f'../assets/networks/{filename}') + f = open(f'{filename}') network_data = json.load(f) self.graph = network_data['graph'] self.company_records = network_data['company_records'] @@ -372,7 +372,7 @@ class Network: path = sugartrail.processing.asciiify_path(path) return path - def perform_hop(self, hops, company_data=None): + def perform_hop(self, hops, company_data=None, print_progress=True): """Gets companies, officers and addresses within n-degrees of seperation from current nodes, where n is the number of hops.""" hop_history = [] @@ -396,26 +396,29 @@ class Network: if address not 
in self.processed_addresses: self.hop.search_address(self, address, company_data) self.processed_addresses.append(address) - IPython.display.clear_output(wait=True) - print("Hop number: " + str(hop+1)) - print("Processed " + str(i+1) + "/" + str(len(selected_addresses)) + " addresses.") + if print_progress: + IPython.display.clear_output(wait=True) + print("Hop number: " + str(hop+1)) + print("Processed " + str(i+1) + "/" + str(len(selected_addresses)) + " addresses.") for j,company in enumerate(selected_companies): if company not in self.processed_companies: self.hop.search_company_id(self,company) self.processed_companies.append(company) - IPython.display.clear_output(wait=True) - print("Hop number: " + str(hop+1)) - print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.") - print("Processed " + str(j+1) + "/" + str(len(selected_companies)) + " companies.") + if print_progress: + IPython.display.clear_output(wait=True) + print("Hop number: " + str(hop+1)) + print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.") + print("Processed " + str(j+1) + "/" + str(len(selected_companies)) + " companies.") for k,officer in enumerate(selected_officers): if officer not in self.processed_officers: self.hop.search_officer_id(self,officer) self.processed_officers.append(officer) - IPython.display.clear_output(wait=True) - print("Hop number: " + str(hop+1)) - print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.") - print("Processed " + str(len(selected_companies)) + "/" + str(len(selected_companies)) + " companies.") - print("Processed " + str(k+1) + "/" + str(len(selected_officers)) + " officers.") + if print_progress: + IPython.display.clear_output(wait=True) + print("Hop number: " + str(hop+1)) + print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.") + print("Processed " + 
str(len(selected_companies)) + "/" + str(len(selected_companies)) + " companies.") + print("Processed " + str(k+1) + "/" + str(len(selected_officers)) + " officers.") self.maxsize_entities = [i for n, i in enumerate(self.maxsize_entities) if i not in self.maxsize_entities[n + 1:]] self.processed_officers, self.processed_companies, self.processed_addresses = [],[],[] self.n += 1 diff --git a/sugartrail/hop.py b/sugartrail/hop.py index 608a2ce..4a4d3a8 100644 --- a/sugartrail/hop.py +++ b/sugartrail/hop.py @@ -68,23 +68,24 @@ class Hop: # get company address history address_history = sugartrail.processing.build_address_history(company_id) # network.address_history.extend(address_history) - for address in address_history: - if 'address' in address: - network.address_history.append(address) - new_address = address['address'] - if new_address not in network.graph: - network.graph[new_address] = { - 'depth': network.n+1, - 'title': new_address, - 'node_type': "Address", - 'arcs': [] + if address_history: + for address in address_history: + if 'address' in address: + network.address_history.append(address) + new_address = address['address'] + if new_address not in network.graph: + network.graph[new_address] = { + 'depth': network.n+1, + 'title': new_address, + 'node_type': "Address", + 'arcs': [] + } + arc = { + 'arc_type': "Historic Address", + 'start_node': company_id } - arc = { - 'arc_type': "Historic Address", - 'start_node': company_id - } - if arc not in network.graph[new_address]['arcs'] and network.graph[new_address]['depth'] == network.n+1: - network.graph[new_address]['arcs'].append(arc) + if arc not in network.graph[new_address]['arcs'] and network.graph[new_address]['depth'] == network.n+1: + network.graph[new_address]['arcs'].append(arc) def search_officer_id(self, network, officer_id): """Gets officers, companies and addresses connected to input officer diff --git a/sugartrail/processing.py b/sugartrail/processing.py index 5e8363f..493cca8 100644 --- 
a/sugartrail/processing.py +++ b/sugartrail/processing.py @@ -110,6 +110,21 @@ def process_address_changes(address_changes): address_changes['items'][i]['description_values']['new_address'] = address_changes['items'][i-1]['description_values']['old_address'] return address_changes +def find_network_connections(first_network, second_network, max_depth=5): + """Returns a list of node IDs present in both networks, or None if none are found within max_depth hops.""" + hops = 0 + while hops < max_depth: + first_network.perform_hop(1, print_progress=False) + second_network.perform_hop(1, print_progress=False) + hops += 1 + print(str(hops) + "/" + str(max_depth) + " hops completed.") + connectors = [x for x in list(filter(first_network.graph.__contains__, second_network.graph.keys())) if x] + if connectors: + print("Found connection(s)!") + return connectors + print("No connections found.") + return + def build_address_history(company_id): """Returns a list of dicts containing historic addresses for input company (company_id)."""