# ===== Scripts/contactConnectivity.py =====
#!/usr/bin/env python
# coding: utf-8

# Working package
# Analysis of multiple businesses linked to one financial or authorized contact.
# The analyses are split into sections covering connectivity via the financial
# or the authorized contact.

from itertools import combinations
import pandas as pd
import numpy as np
import pickle
import string
import csv
import os

# --------------- Helper functions --------------------------------

# Updated 15/08
def load_data(path, sheet=None, ignore_hash=True):
    """Load data from an Excel file and drop uninformative columns.

    Args:
        path (str): Path to the Excel file.
        sheet (str, optional): Name of the sheet to read in a multi-sheet
            workbook. Defaults to None (first sheet).
        ignore_hash (bool, optional): If True, drop system hash columns,
            i.e. any column whose name contains '(Do Not Modify)'.
            Defaults to True.

    Returns:
        pd.DataFrame: The loaded data, without all-NaN columns and without
        columns holding a single constant value.
    """
    if sheet:
        xls = pd.ExcelFile(path)
        data = pd.read_excel(xls, sheet)
    else:
        data = pd.read_excel(path)
    # System hash columns carry no analytical information; remove them.
    if ignore_hash:
        data = data[[col for col in data.columns if '(Do Not Modify)' not in col]]

    # Drop columns that are entirely empty.
    data.dropna(axis=1, how='all', inplace=True)
    # Keep only columns with more than one distinct value.
    data = data[[c for c in data.columns if len(data[c].unique()) > 1]]

    return data


def select_column(df, colnames):
    """Return a dataframe restricted to the given columns.

    Args:
        df (pd.DataFrame): Data from which columns are selected.
        colnames (list): List of column names to keep.

    Returns:
        pd.DataFrame: Data with only the columns in ``colnames``.
    """
    return df[colnames]


def add_column(d, cols='', new_col='ContactName'):
    """Add a column built by joining two existing columns with a space.

    Args:
        d (pd.DataFrame): Data to which the new column is added.
        cols (list, optional): Two column names whose string values are
            concatenated. Defaults to ['First Name', 'Last Name'] when empty.
        new_col (str, optional): Name of the new column.
            Defaults to 'ContactName'.

    Returns:
        pd.DataFrame: A copy of ``d`` with the new column added.
    """
    if not cols:
        cols = ['First Name', 'Last Name']
    df = d.copy()
    df[new_col] = df[cols[0]].map(str) + ' ' + df[cols[1]].map(str)

    return df


def filter_data(df, col_name, row_vals, select=True):
    """Filter rows by whether ``col_name`` takes a value in ``row_vals``.

    Args:
        df (pd.DataFrame): Data to operate on.
        col_name (str): Column name to test.
        row_vals (list): Column values to select or exclude on.
        select (bool, optional): If True keep matching rows, otherwise
            exclude them. Defaults to True.

    Returns:
        pd.DataFrame: The filtered rows.
    """
    if select:
        data = df[df[col_name].isin(row_vals)]
    else:
        data = df[~df[col_name].isin(row_vals)]

    return data
def multiple_app(df, group_col, filter_col):
    """Return the ``group_col`` values that occur on more than one row.

    Args:
        df (pd.DataFrame): The dataframe to process.
        group_col (str): Column to group by.
            NOTE(review): a list was previously documented as accepted, but
            ``filtered[group_col].unique()`` only works for a single column
            name — confirm callers never pass a list.
        filter_col (str): Column whose entries are counted per group.

    Returns:
        np.ndarray: Unique ``group_col`` values whose count is greater than 1.
    """
    selected = df.groupby(
        group_col, as_index=False
    )[filter_col].count().sort_values([filter_col], ascending=False)
    # Keep only groups that appear more than once.
    filtered = selected[selected[filter_col] > 1]
    multiple_vals = filtered[group_col].unique()

    return multiple_vals

# Putting everything together
def get_apps(df, exclude_status):
    """Collect applications whose contacts are linked to multiple applications.

    Args:
        df (pd.DataFrame): Dataframe to process; must contain the columns
            'Application Status', 'FinContactName', 'AuthContactName' and
            'Reference Number'.
        exclude_status (list): 'Application Status' values to exclude.

    Returns:
        pd.DataFrame: Applications that share a financial or authorized
        contact with at least one other application, sorted by contact names.
    """
    data_with_excluded_col = filter_data(df, 'Application Status', exclude_status, select=False)
    # Contacts that appear on more than one application.
    fin_contact_mask = multiple_app(data_with_excluded_col, 'FinContactName', 'Reference Number')
    auth_contact_mask = multiple_app(data_with_excluded_col, 'AuthContactName', 'Reference Number')

    # Filter the status-filtered data for those repeated contacts.
    app_fin_contact = filter_data(data_with_excluded_col, 'FinContactName', fin_contact_mask)
    app_auth_contact = filter_data(data_with_excluded_col, 'AuthContactName', auth_contact_mask)
    # Combine the two views and remove duplicate applications.
    combined_data = pd.concat([app_fin_contact, app_auth_contact]).drop_duplicates('Reference Number')

    return combined_data.sort_values(by=['AuthContactName', 'FinContactName'])

# Created 15/08. Tested to get only unique duplicates according to columns.
def get_duplicates(df, subset=None, unique=False):
    """Return the duplicated rows of ``df`` according to ``subset``.

    Args:
        df (pd.DataFrame): The dataframe to process.
        subset (str or list, optional): Column(s) on which duplicates are
            detected. Defaults to None (all columns).
        unique (bool, optional): If False (default), return every duplicated
            row except its first occurrence. If True, return one
            representative row per duplicated key.

    Returns:
        pd.DataFrame: The duplicate rows, sorted by ``subset`` descending
        when a subset is given.
    """
    if unique:
        duplicates = df[df.duplicated(subset=subset, keep=False)].drop_duplicates(subset=subset)
    else:
        duplicates = df[df.duplicated(subset=subset, keep='first')]

    # sort_values(by=None) raises, so only sort when a subset was given.
    if subset is None:
        return duplicates
    return duplicates.sort_values(by=subset, ascending=False)
#------------- end helpers -----------------------------------------------

def get_non_duplicates(df, subset=None, keep='first'):
    """Return ``df`` with duplicate rows removed.

    Args:
        df (pd.DataFrame): The dataframe to process.
        subset (str or list, optional): Column(s) on which duplicates are
            detected. Defaults to None (all columns).
        keep ({'first', 'last', False}, optional): Which occurrence to keep.
            Defaults to 'first'.

    Returns:
        pd.DataFrame: The de-duplicated rows.
    """
    non_duplicates = df.drop_duplicates(subset=subset, keep=keep)

    return non_duplicates

def get_non_duplicate_contacts(df, subset=None, keep='first'):
    """Deduplicate contact rows, choosing one record per ``subset`` key.

    Args:
        df (pd.DataFrame): The dataframe to process.
        subset (str or list, optional): Column(s) on which duplicates are
            detected. Defaults to None (all columns).
        keep ({'first', 'last', False}, optional): Which occurrence to keep
            after sorting. Defaults to 'first'.

    Returns:
        pd.DataFrame: One row per key, with the previous index kept as a
        column (``reset_index()``).
    """
    # Work on a copy so the caller's frame is not touched and pandas does not
    # emit SettingWithCopyWarning for the helper column below.
    data = df.drop_duplicates().copy()
    # Count NaN fields per row; used to decide which record to keep.
    data['NanCount'] = data.isnull().sum(axis=1)
    # NOTE(review): ' Full Name' (leading space) appears to match the exported
    # column name — confirm. Sorting NanCount descending with keep='first'
    # keeps the record with the MOST missing fields per contact; confirm this
    # is the intended notion of "up to date".
    data.sort_values(by=[' Full Name', 'NanCount'], ascending=False, inplace=True)
    # Drop the helper column, then keep one row per key.
    non_duplicates = data.drop('NanCount', axis=1).drop_duplicates(subset=subset, keep=keep)

    return non_duplicates.reset_index()

# Sept 1, 2020
def get_connectivity(df, rel_col):
    """Yield pairs of applications connected through a shared contact.

    Args:
        df (pd.DataFrame): Application data; must contain the
            'FinContactName' and 'AuthContactName' columns.
        rel_col (str or list): Column(s) to group by to obtain the relation.

    Yields:
        tuple: ``(connection, (row_a, row_b))`` where ``connection`` is the
        shared group key and ``row_a``/``row_b`` are the row values of two
        applications sharing it, with the contact-name columns removed.
    """
    drop_rel_cols = ['FinContactName', 'AuthContactName']
    for connect, frame in df.groupby(rel_col):
        # All unordered pairs of rows within one group:
        # [r1, r2, r3] -> (r1, r2), (r1, r3), (r2, r3).
        # A single-row group yields no pairs. (The previous `if comb:` check
        # was a no-op: a combinations object is truthy even when empty.)
        for related in combinations(frame.drop(drop_rel_cols, axis=1).values, 2):
            yield connect, related
+ """ + comb = combinations(frame.drop(drop_rel_cols, axis=1).values, 2) + if comb: + for related in comb: + yield connect, related + + + +def combine_account(combined, relationship): + """Connects account together via some relations + + Args: + combined (connection, pd.dataframe): tuple of connectivity and pd.dataframe + of items connected to it + relationship (string): Type of relationship that exist between items in pd.dataframe + + Returns: + pd.dataframe: dataframe in which the related items are place on a row with a + relationship tuple added + """ + + #Place holder for the relationship + df = pd.DataFrame(columns=[ + 'Reference Number B1', 'Business Number B1', 'Legal Name B1', 'Operating Name B1', + 'Reference Number B2', 'Business Number B2', 'Legal Name B2', 'Operating Name B2', + 'Related Contact Names', 'Relationship Types' + ]) + + for contact, connected_business in combined: + conected_account = np.concatenate( + (connected_business[0], connected_business[1], [contact, relationship]), + axis=None + ) + row = {key: val for key, val in zip(df.columns, conected_account)} + + df = df.append(row, ignore_index=True) + + return df + +# 03/04/2020 +def get_contact_difference(system_file, external_file, on): + """Takes two pd.dataframe objects and create complement file of external_file. 
#------------- end helpers -----------------------------------------------

# ===== Scripts/detecting-apps-connectivity.py =====
#!/usr/bin/env python
# coding: utf-8

# Analysis of multiple businesses linked to one financial or authorized contact.
# The analyses are split into sections covering connectivity via the financial
# or the authorized contact.

import pandas as pd
import pickle
import string
import csv
import os

# --------------- Helper functions --------------------------------

def load_data(path, sheet=None):
    """Load data from an Excel file, dropping the leading system columns.

    Args:
        path (str): Path to the Excel file.
        sheet (str, optional): Name of the sheet to read in a multi-sheet
            workbook. Defaults to None (first sheet).

    Returns:
        pd.DataFrame: The loaded data without its first three columns.
    """
    if sheet:
        xls = pd.ExcelFile(path)
        data = pd.read_excel(xls, sheet)
    else:
        data = pd.read_excel(path)
    # The first three columns of the export are system hash columns.
    # NOTE(review): this assumes a fixed export layout — confirm; the sibling
    # contactConnectivity.load_data drops them by name instead.
    return data[data.columns[3:]]


def select_column(df, colname):
    """Return a dataframe restricted to the given columns.

    Args:
        df (pd.DataFrame): Data from which columns are selected.
        colname (list): List of column names to keep.

    Returns:
        pd.DataFrame: Data with only the columns in ``colname``.
    """
    return df[colname]


def add_column(d, cols='', new_col='ContactName'):
    """Add a column built by joining two existing columns with a space.

    Args:
        d (pd.DataFrame): Data to which the new column is added.
        cols (list, optional): Two column names whose string values are
            concatenated. Defaults to ['First Name', 'Last Name'] when empty.
        new_col (str, optional): Name of the new column.
            Defaults to 'ContactName'.

    Returns:
        pd.DataFrame: A copy of ``d`` with the new column added.
    """
    if not cols:
        cols = ['First Name', 'Last Name']
    df = d.copy()
    df[new_col] = df[cols[0]].map(str) + ' ' + df[cols[1]].map(str)

    return df


def filter_data(df, col_name, row_vals, select=True):
    """Filter rows by whether ``col_name`` takes a value in ``row_vals``.

    Args:
        df (pd.DataFrame): Data to operate on.
        col_name (str): Column name to test.
        row_vals (list): Column values to select or exclude on.
        select (bool, optional): If True keep matching rows, otherwise
            exclude them. Defaults to True.

    Returns:
        pd.DataFrame: The filtered rows.
    """
    if select:
        data = df[df[col_name].isin(row_vals)]
    else:
        data = df[~df[col_name].isin(row_vals)]

    return data

def multiple_app(df, group_col, filter_col):
    """Return the ``group_col`` values that occur on more than one row.

    Args:
        df (pd.DataFrame): The dataframe to process.
        group_col (str): Column to group by.
        filter_col (str): Column whose entries are counted per group.

    Returns:
        np.ndarray: Unique ``group_col`` values whose count is greater than 1.
    """
    selected = df.groupby(
        group_col, as_index=False
    )[filter_col].count().sort_values([filter_col], ascending=False)
    filtered = selected[selected[filter_col] > 1]
    multiple_vals = filtered[group_col].unique()

    return multiple_vals

# Putting everything together
def get_apps(df, exclude_status):
    """Collect applications whose contacts are linked to multiple applications.

    Args:
        df (pd.DataFrame): Dataframe to process; must contain the columns
            'Application Status', 'FinContactName', 'AuthContactName' and
            'Reference'.
        exclude_status (list): 'Application Status' values to exclude.

    Returns:
        pd.DataFrame: Applications that share a financial or authorized
        contact with at least one other application, sorted by contact names.
    """
    data_with_excluded_col = filter_data(df, 'Application Status', exclude_status, select=False)
    # Contacts that appear on more than one application.
    fin_contact_mask = multiple_app(data_with_excluded_col, 'FinContactName', 'Reference')
    auth_contact_mask = multiple_app(data_with_excluded_col, 'AuthContactName', 'Reference')

    # Filter the status-filtered data for those repeated contacts.
    app_fin_contact = filter_data(data_with_excluded_col, 'FinContactName', fin_contact_mask)
    app_auth_contact = filter_data(data_with_excluded_col, 'AuthContactName', auth_contact_mask)
    # Combine the two views and remove duplicate applications.
    combined_data = pd.concat([app_fin_contact, app_auth_contact]).drop_duplicates('Reference')

    return combined_data.sort_values(by=['AuthContactName', 'FinContactName'])


#------------- end helpers -----------------------------------------------

data_path = r"data.xlsx"
sheet = "Application Advanced Find View"
raw_data = load_data(data_path, sheet=sheet)

# Example
# Columns of interest for the connectivity analysis.
app_col = ['Reference', 'Business Number', 'LegalName', 'OperatingName',
           'First Name', 'Last Name', 'Application Status', 'Authorized Contact Email',
           'Authorized Contact First Name', 'Authorized Contact Last Name',
           'Authorized Contact Title', 'Authorized Telephone Number']


sel_data = select_column(raw_data, app_col)
authorized_contact = ['Authorized Contact First Name', 'Authorized Contact Last Name']
financial_contact = ['First Name', 'Last Name']
# Build single-string contact-name columns used for grouping.
d1 = add_column(sel_data, authorized_contact, 'AuthContactName')
data_with_added_col = add_column(d1, financial_contact, 'FinContactName')


# First pass: exclude incomplete and duplicate applications only.
exclude_status = ['Application Incomplete', 'Duplicate Application']
data_ex_incomplete_duplicates = get_apps(data_with_added_col, exclude_status)
print(data_ex_incomplete_duplicates.head(2))
print(data_ex_incomplete_duplicates.shape)

#data_ex_incomplete_duplicates.to_csv('data_ex_incomplete_duplicates-08-06.csv')


# Second pass: also exclude ineligible and not-supported applications.
exclude_status = ['Application Ineligible',
                  'Application Incomplete',
                  'Not Supported',
                  'Duplicate Application']

data_ex_incomplete_duplicates_notsup = get_apps(data_with_added_col, exclude_status)
print(data_ex_incomplete_duplicates_notsup.head(2))
print(data_ex_incomplete_duplicates_notsup.shape)


#data_ex_incomplete_duplicates_notsup.to_csv('data_ex_incomplete_duplicates_notsup_ineli-08-06.csv')

# ===== Scripts/getConnectedApp.py =====
#!/usr/bin/env python
# coding: utf-8

# Analysis of multiple businesses linked to one financial or authorized contact.
# Connectivity is detected via the financial or the authorized contact.
# Helpers shared with contactConnectivity.py.
from contactConnectivity import load_data, get_non_duplicate_contacts, get_duplicates, add_column,\
    get_apps, select_column, combine_account, get_connectivity
import pandas as pd
import numpy as np
import pickle
import string
import csv
import os


#------------- end helpers -----------------------------------------------

# Source workbook exported from the system.
data_path = r'C:\Users\Downloads\Application Advanced Find View 2020-08-29 2-29-22 PM.xlsx'
sheet = "Application Advanced Find View"
raw_data = load_data(data_path, sheet=sheet)


# Columns of interest for the connectivity analysis.
app_col = [
    'Reference Number', 'Business Number', 'Legal Name', 'Operating Name',
    'First Name', 'Last Name', 'Application Status',
    'Authorized Business Contact Email', 'Authorized Business Contact First Name',
    'Authorized Business Contact Last Name', 'Authorized Business Contact Title',
    'Authorized Business Telephone Number'
    ]


sel_data = select_column(raw_data, app_col)
authorized_contact = ['Authorized Business Contact First Name', 'Authorized Business Contact Last Name']
financial_contact = ['First Name', 'Last Name']
# Build single-string contact-name columns used for grouping.
d1 = add_column(sel_data, authorized_contact, 'AuthContactName')
data_with_added_col = add_column(d1, financial_contact, 'FinContactName')

#data_ex_incomplete_duplicates.to_csv('data_ex_incomplete_duplicates-08-06.csv')


# Bilingual (EN | FR) status labels to exclude from the analysis.
exclude_status = ['Application Ineligible | Demande non admissible', 'Application Incomplete | Demande incomplète',
                  'Not Supported | Non soutenus', 'Duplicate Application | Double de la demande']

data_ex_incomplete_duplicates_notsup = get_apps(data_with_added_col, exclude_status)
#print(data_ex_incomplete_duplicates_notsup.head(2))
print(data_ex_incomplete_duplicates_notsup.shape)

print()
print('-----------------------------------------------------------------------------------')


# Contact-detail columns not needed when pairing related businesses.
dropcols = ['First Name', 'Last Name', 'Application Status',
            'Authorized Business Contact Email',
            'Authorized Business Contact First Name',
            'Authorized Business Contact Last Name',
            'Authorized Business Contact Title',
            'Authorized Business Telephone Number'
            ]
# Generators of (contact, (row_a, row_b)) pairs connected through each
# contact type; contact-name columns are kept since get_connectivity
# groups and drops them itself.
related_fin_contact = get_connectivity(
    data_ex_incomplete_duplicates_notsup.drop(dropcols, axis=1), 'FinContactName'
    )
related_aut_contact = get_connectivity(
    data_ex_incomplete_duplicates_notsup.drop(dropcols, axis=1), 'AuthContactName'
    )

# Tabulate the related pairs for each relationship type.
fc = combine_account(related_fin_contact, 'Financial Contact')
ac = combine_account(related_aut_contact, 'Authorized Contact')

dd = pd.concat([fc, ac])

# ===== Scripts/getDifferenceContactFiles.py =====
#!/usr/bin/env python
# coding: utf-8

# Compare two files that contain contact information and identify the
# difference. One file contains the contacts that are already in the system,
# the other contains a list that should be added to the system. Some of the
# new contacts are already in the system, so we identify those and upload
# only the difference.

import pandas as pd
import numpy as np
from contactConnectivity import load_data
import csv


# File locations.
minerva_path = r'C:\Users\Downloads\Contacts Advanced Find View 2020-09-01 11-26-04 AM.xlsx'
isr_path = r'C:\Users\Documents\List Template_ISR updates.xlsx'
# --------------- Helper functions --------------------------------

# Load contacts already in the system.
system_contact = load_data(minerva_path)
# Format column headers to title case so they match the ISR file.
system_contact.columns = [c.title() for c in system_contact.columns]

# Load contacts to be imported from ISR.
# NOTE(review): the extra 'data =' alias looks unintentional — confirm.
isr_contact = data = pd.read_excel(isr_path, header=1)

# ## 2. Common data
# The common data can be identified via an inner join. We start with a less
# rigorous condition, the first and last names. That leads to 362 contacts
# that are already in the system.
# Inner join on first/last name = contacts present in both files.
common_contacts = pd.merge(system_contact[['First Name', 'Last Name']], isr_contact, on=['First Name', 'Last Name'])

# Common contacts are those that are already in the system.
# To get the difference that should be imported, we use concat: rows present
# in both frames appear twice and are removed by keep=False.
contacts_to_import = pd.concat([common_contacts, isr_contact], sort=False).drop_duplicates(keep=False)
# contacts_to_import.to_csv('contacts_not_in_system_FL.csv', index=False)


# ## 3. Common data on three fields
# Here a more rigorous condition is used: first name, last name and email.
# This leads to 193 contacts that are already in the system.
common_contacts_3 = pd.merge(
    system_contact[['First Name', 'Last Name', 'Email']],
    isr_contact, on=['First Name', 'Last Name', 'Email']
    )

contacts_to_import_3 = pd.concat([common_contacts_3, isr_contact], sort=False).drop_duplicates(keep=False)

# Putting all together
def get_contact_difference(system_file, external_file, on):
    """Return the rows of ``external_file`` that are absent from ``system_file``.

    Equivalent to B - (A ∩ B) for system data A and external data B.

    Args:
        system_file (pd.DataFrame): Data that is present in the system.
        external_file (pd.DataFrame): External file to be imported.
        on (list): Columns to base the comparison on.

    Returns:
        pd.DataFrame: The rows still to be imported.
    """

    common_contacts = pd.merge(system_file[on], external_file, on=on) #Intersection
    contacts_to_import = pd.concat(
        [common_contacts, external_file], sort=False
    ).drop_duplicates(keep=False) #Use intersection and original file to get the difference

    return contacts_to_import
# ===== Scripts/location.py =====
# Program to read lat/lon of customers and represent them on a map surface.

import random
import pandas as pd
import pydeck as pdk
from sklearn.datasets import make_blobs

LATITUDE, LONGITUDE = 6.221153, 10.679615  # Map focus center
# 6.224683, 10.663074


def get_data():
    """Generate 100 synthetic customer locations clustered around the center.

    Returns:
        pd.DataFrame: Columns 'lat' and 'lng', one row per customer.
    """
    # make_blobs returns (samples, labels); only the coordinates are needed.
    # The previous code passed the whole tuple to pd.DataFrame, which does
    # not yield a (100, 2) frame and broke the column naming.
    points, _ = make_blobs(
        n_samples=100, centers=[[6.221153, 10.679615]],
        n_features=2, cluster_std=0.05, random_state=0
    )
    data = pd.DataFrame(points, columns=['lat', 'lng'])

    return data


def scatterplot_map(*args, **kwargs):
    """Plot one scatter layer per dataframe on a pydeck (mapbox) map.

    Args:
        *args: DataFrames whose coordinate columns are labelled 'lng' and
            'lat' (with an optional 'addr' tooltip column).
        **kwargs: 'fill_colors' — optional list of RGB colors, one per
            dataframe; evenly-spaced random colors are generated otherwise.

    Returns:
        pdk.Deck: The configured deck ready for rendering.
    """
    colors = kwargs.get('fill_colors', _generate_colors(len(args))[0])
    layers = [_layer(df, fill_color=color) for df, color in zip(args, colors)]
    view_state = _view()

    return pdk.Deck(
        map_style='light',
        layers=layers,
        initial_view_state=view_state,
        # tooltip={"text": "{addr}"}
    )
There are two maps + The columns should be labelled as "lng", "lat" + And an optional tooltip name addr + """ + colors = kwargs.get('fill_colors', _generate_colors(len(args))[0]) + layers = [_layer(df, fill_color=color) for df, color in zip(args, colors)] + view_state = _view() + + return pdk.Deck( + map_style='light', + layers=layers, + initial_view_state=view_state, + # tooltip={"text": "{addr}"} + ) + + +def _layer(df, position=["lng", "lat"], fill_color=[242, 242, 242]): + """Creates a map layer for a given dataframe""" + # Define a layer to display on a map + layer = pdk.Layer( + "ScatterplotLayer", + df, + pickable=True, + opacity=0.9, + stroked=True, + filled=True, + radius_scale=10, + radius_min_pixels=4, + radius_max_pixels=50, + line_width_min_pixels=1, + get_position=position, + get_fill_color=fill_color, # [255, 140, 0],#[180, 0, 200, 140] + get_line_color=fill_color + ) + return layer + + +def _view(latitude=LATITUDE, longitude=LONGITUDE): + """Set the viewport location""" + return pdk.ViewState( + latitude=latitude, + longitude=longitude, + zoom=10, + min_zoom=5, + max_zoom=15, + bearing=0, + pitch=0 + ) + + +def _generate_colors(n): + """Generates n colors that are equally spaced + + Args: + n (int): number of colors required + + Returns: + list: list of list. 
if __name__ == '__main__':
    events_locations = get_data()  # Customer locations
    face_location = pd.DataFrame(
        {'lat': [6.224683], 'lng': [10.663074]})  # Office Center
    r = scatterplot_map(events_locations, face_location)
    r.to_html("kumbo.html")

# ===== Scripts/word_cloud.py =====
"""
    This program plots a word cloud from a text file of WhatsApp chats.
    Export the chats to text, put the file in the same dir as this script
    and name it appropriately.
    """

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd

# Read the exported WhatsApp chat file.
with open(r"/home/siro/Downloads/ACR.txt", encoding="latin-1") as f:
    text = f.readlines()

comment_words = []
names = []
stopwords = set(STOPWORDS)

for val in text:
    # Each line looks like 'date - name: message'; splitting on ' - '
    # separates the date from the name/message part. (The previous
    # `tokens = []` assignment here was dead code and has been removed.)
    date, *name_message = val.split(' - ')
    comment_words.extend(name_message)
# Split each 'name: message' entry into sender and message text.
t = []
for w in comment_words:
    name, *message = w.split(':')  # separates sender and message
    names.append(name)
    t.extend(message)

# Keep only purely alphabetic tokens from the message text.
post_words = " ".join(
    w for w in " ".join(t).split() if w.isalpha()
)

# Plot the message words; switch to the commented line to plot sender names.
plot_words = post_words
# plot_words = " ".join(names)
wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(plot_words)

# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()