VISUALIZING THE INTER-RELATIONSHIPS BETWEEN DATA IN A MATRIX.
An easy way to interact with and analyze matrix data. The above image (created in JupyterLab) shows over 7,000 dessert recipe flavor pairings. Each flavor's outer rim segment grows larger the more combinations it appears in.
WHERE’S THE CODE?!
import pandas as pd
import numpy as np
import sqlalchemy as sq
import datetimeimport plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
from plotly.subplots import make_subplotsimport geopandas as gp
import datetime as dt
import itertools
%matplotlib inline
import ssl
from chord import Chordssl._create_default_https_context = ssl._create_unverified_contextpd.options.display.max_rows = 999
pd.set_option("display.float_format", lambda x: "%.5f" % x)df = pd.read_csv("recipes.csv")
# Size of the raw table and the number of missing values per column.
print(df.shape)
df.isna().sum()

# Drop every row that has a missing value in any column.
df = df.dropna()

# "flavors" is a whitespace-separated string; turn it into a sorted list of
# unique tokens.
df["flavor_list"] = df["flavors"].apply(lambda text: sorted(set(text.split())))

# Distributions of the two columns used for filtering below.
df.rating.hist()
df.success.hist()

# Filter on > 2 rating and on success >= 0.1.
# NOTE(review): the original comment reads "at least 1 person having reported
# success making it"; the unit of `success` is not visible here -- confirm.
df = df[df.rating > 2]
print(df.shape)
# .copy() so the column assignments below write to an independent frame
# instead of a filtered slice (avoids pandas' chained-assignment warning).
df = df[df.success >= 0.1].copy()
print(df.shape)

# Clean up: split the recipe name into words (hyphens count as spaces), keep
# the unique words in sorted order, then lowercase them.
df["recipe_list"] = df["recipe_name"].apply(
    lambda name: [
        word.lower() for word in sorted(set(name.replace("-", " ").split()))
    ]
)

# We want to use words in the recipe name as flavors too (like caramel), so
# merge both lists and de-duplicate again.
df["flavor_list2"] = (df["flavor_list"] + df["recipe_list"]).apply(
    lambda words: sorted(set(words))
)

# Next step: remove bad flavor values from flavor_list2.
# Tokens that are dropped from flavor_list2 and never counted as a flavor.
REMOVE_FLAVORS = [
    "with",
    "and",
    "bundt",
    "cake",
    "cream",
    "pie",
    "ice",
    "sauce",
    "cookies",
    "tart",
    "pudding",
    "allspice",
    "butter",
    "oat",
    "cheesecake",
    "white",
    "fruit",
    "graham",
    "raisin",
]

# Tokens that are rewritten to a canonical flavor name instead of being kept
# as they are.
FLAVOR_ALIASES = {
    "expresso": "espresso",
    "cocoa": "chocolate",
    "peanut": "peanut butter",
}

# Set copy of REMOVE_FLAVORS for fast membership tests; the list itself is
# left untouched for any other code that uses it.
_REMOVED_FLAVORS = frozenset(REMOVE_FLAVORS)


def _normalize_flavors(raw_flavors):
    """Return the cleaned, de-duplicated, sorted flavors of one recipe.

    Aliased tokens are replaced by their canonical name, tokens listed in
    REMOVE_FLAVORS are dropped, and everything else is kept unchanged.
    """
    cleaned = set()
    for flavor in raw_flavors:
        if flavor in FLAVOR_ALIASES:
            cleaned.add(FLAVOR_ALIASES[flavor])
        elif flavor not in _REMOVED_FLAVORS:
            cleaned.add(flavor)
    # A set is used because aliasing can produce a name that is already
    # present (e.g. "cocoa" next to "chocolate"); keeping both would count the
    # recipe twice for that flavor and create a flavor paired with itself.
    return sorted(cleaned)


df["flavor_list3"] = df["flavor_list2"].apply(_normalize_flavors)
all_f_lists = df["flavor_list3"].tolist()
df.head(2)

# Now get value_count (rank) of each flavor: flatten all per-recipe lists
# into one long series of flavor names.
flavors = list(itertools.chain.from_iterable(df["flavor_list3"]))
flavor_series = pd.Series(flavors)
flavor_series.nunique()
# Look at top dessert flavors
# Count how often each flavor occurs; value_counts() orders the result from
# most to least frequent.
flavor_cdf = (
    flavor_series.value_counts()
    .rename_axis("flavor")
    .reset_index(name="recipe_counts")
)

# Sanity check on the renamed flavors: show which of the raw and canonical
# spellings are present. The canonical name contains a space
# ("peanut butter"), so that is the spelling to look for.
flavor_cdf[
    flavor_cdf["flavor"].isin(
        ["peanut butter", "peanut", "espresso", "cocoa", "expresso"]
    )
]

# Number of most frequent flavors kept for the charts below.
TOP_NUM_FLAVORS = 37
top_flavor_cdf = flavor_cdf.head(TOP_NUM_FLAVORS)
top_flavor_cdf

# Bar chart of the occurrence count of each top flavor.
fig = go.Figure(
    [
        go.Bar(
            x=top_flavor_cdf.flavor,
            y=top_flavor_cdf.recipe_counts,
        )
    ]
)
py.iplot(fig)

# Save only top flavors
top_flavors = top_flavor_cdf.flavor.tolist()
## Chord Diagram
# Take the original dataframe and calculate the co-occurrence matrix.
adf = df[["recipe_name", "flavor_list3"]]
adf.head()

# Peek at the first recipe: its flavors and the pairs they generate.
f_list = adf["flavor_list3"].iloc[0]
f_list
list(itertools.combinations(f_list, 2))

# Every unordered pair of flavors within each recipe, across all recipes.
flavor_pairs_list = [
    pair
    for recipe_flavors in adf["flavor_list3"]
    for pair in itertools.combinations(recipe_flavors, 2)
]
flavor_pdf = pd.DataFrame(flavor_pairs_list, columns=["flavor1", "flavor2"])
print(flavor_pdf.shape)
flavor_pdf.head(2)

# Keep only the pairs where both flavors are among the top flavors.
flavor_top_pdf = flavor_pdf[
    flavor_pdf.flavor1.isin(top_flavors) & flavor_pdf.flavor2.isin(top_flavors)
]
print(flavor_top_pdf.shape)
flavor_top_pdf.head(2)

# Emit each pair in both directions so the resulting matrix is symmetric.
data = list(
    itertools.chain.from_iterable(
        (pair, pair[::-1]) for pair in flavor_top_pdf.values
    )
)

# Rows/columns are flavors; each cell counts how often the two co-occur.
cooccurrence = pd.pivot_table(
    pd.DataFrame(data), index=0, columns=1, aggfunc="size", fill_value=0
)
matrix = cooccurrence.values.tolist()

mdf = pd.DataFrame(matrix)
print(mdf.shape)
mdf.head(2)
mdf.sum()

# Take the labels from the pivot's own row index so that names[i] always
# describes matrix[i].
names = cooccurrence.index.tolist()
# One row per flavor, in the same order as the matrix rows.
name_df = pd.DataFrame(names, columns=["flavor_name"])
name_df

# Hex color used for each flavor's segment in the chord diagram.
color_dic = {
    "almond": "#b06e31",
    "apricot": "#fbceb1",
    "apple": "#cf350e",
    "banana": "#ffe135",
    "blackberry": "#43182f",
    "blueberry": "#4f86f7",
    "brandy": "#87413f",
    "caramel": "#C68E17",
    "cardamom": "#958d34",
    "cherry": "#8f0b0b",
    "chocolate": "#622A0F",
    "cinnamon": "#D2691E",
    "clove": "#B5651D",
    "coconut": "#E1DABB",
    "coffee": "#3B270C",
    "cranberry": "#950714",
    "ginger": "#b06500",
    "hazelnut": "#ae9f80",
    "honey": "#ebb028",
    "lemon": "#e8ce25",
    "lime": "#5ce825",
    "maple": "#bb9351",
    "mint": "#98ff98",
    "nutmeg": "#7e4a3b",
    "orange": "#ffa500",
    "peach": "#ffe5b4",
    "peanut butter": "#cd9141",
    "pear": "#d1e231",
    "pecan": "#48260D",
    "pineapple": "#e6ae25",
    # The original value had a stray trailing space ("#93c572 ").
    "pistachio": "#93c572",
    "pumpkin": "#ff7518",
    "raspberry": "#db1f1f",
    "rum": "#D7C5A9",
    "strawberry": "#FC5A8D",
    "vanilla": "#F9E5BC",
    "walnut": "#43270F",
}

# Neutral gray for any flavor that has no entry in color_dic; without it the
# lookup yields NaN, which is not a usable color value.
DEFAULT_FLAVOR_COLOR = "#999999"

name_df["color"] = (
    name_df["flavor_name"].map(color_dic).fillna(DEFAULT_FLAVOR_COLOR)
)
name_df
colors = name_df["color"].tolist()

# Build the diagram once, then display it and export it as HTML.
# NOTE(review): where to_html() puts its output is decided by the chord
# package and is not visible here -- confirm against its documentation.
chord_diagram = Chord(matrix, names, colors=colors, wrap_labels=False)
chord_diagram.show()
chord_diagram.to_html()
HOW ELSE COULD THIS BE USEFUL?
There are a million ways this could be used, but one that comes to mind, from a fraud/risk perspective, is identifying the co-occurrence of high-volume and heavily weighted rules.
How do you use Chord Diagrams?
Social Media:
Instagram: @maxbade
Github: @supercoolgetsallthegirlsmax