Les règles d'association sont un algorithme d'apprentissage non supervisé. Il est utilisé pour analyser les habitudes d'achats des clients.
U itemset est un utiliser l'algorithme du KMeans pour former un modèle, il faut des données numériques.
Le support d'un itemset est la fréquence d'un itemset.
# !pip install -q pandas mlxtend
import re import pandas as pd from mlxtend.preprocessing import TransactionEncoder from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
DATA_PATH = "/kaggle/input/sales-forecasting/train.csv" SUPPORT = 0.2
sales_data = pd.read_csv(DATA_PATH) sales_data.sample(10)
sales_data.shape
(9800, 18)
sales_data.describe()
sales_data.info()
<class 'pandas.core.frame.dataframe'=""> RangeIndex: 9800 entries, 0 to 9799 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID 9800 non-null int64 1 Order ID 9800 non-null object 2 Order Date 9800 non-null object 3 Ship Date 9800 non-null object 4 Ship Mode 9800 non-null object 5 Customer ID 9800 non-null object 6 Customer Name 9800 non-null object 7 Segment 9800 non-null object 8 Country 9800 non-null object 9 City 9800 non-null object 10 State 9800 non-null object 11 Postal Code 9789 non-null float64 12 Region 9800 non-null object 13 Product ID 9800 non-null object 14 Category 9800 non-null object 15 Sub-Category 9800 non-null object 16 Product Name 9800 non-null object 17 Sales 9800 non-null float64 dtypes: float64(2), int64(1), object(15) memory usage: 1.3+ MB
new_columns = [re.sub(' |-', '_', column).strip() for column in list(sales_data.columns)] sales_data.columns = new_columns sales_data.head()
sales_data.columns
Index(['Row_ID', 'Order_ID', 'Order_Date', 'Ship_Date', 'Ship_Mode', 'Customer_ID', 'Customer_Name', 'Segment', 'Country', 'City', 'State', 'Postal_Code', 'Region', 'Product_ID', 'Category', 'Sub_Category', 'Product_Name', 'Sales'], dtype='object')
def create_cart(item): return [elt for elt in list(item)]
df = sales_data[["Customer_ID", "Sub_Category"]].groupby(by="Customer_ID").agg({"Sub_Category": create_cart}).reset_index().rename(columns={"Sub_Category": "cart"}) # df = sales_data[["Customer_ID", "Product_Name"]].groupby(by="Customer_ID").agg({"Product_Name": create_cart}).reset_index().rename(columns={"Product_Name": "cart"}) df.head()
df['cart_size'] = df['cart'].apply(lambda x: len(x)) df.sample(15)
X = df['cart'].values
te = TransactionEncoder() te_ary = te.fit(X).transform(X) df_x = pd.DataFrame(te_ary, columns=te.columns_) df_x.head()
frequent_itemsets = fpgrowth(df_x, min_support=SUPPORT, use_colnames=True) #### alternatively: #frequent_itemsets = apriori(df, min_support=SUPPORT, use_colnames=True) # frequent_itemsets = fpmax(df, min_support=SUPPORT, use_colnames=True) frequent_itemsets.sort_values(by='support', ascending=False).head(10)
from mlxtend.frequent_patterns import association_rules association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7).head(10)
Blog sur le Data Mining
Librairie Python de Machine Learning
Association Rules
Association rules generation from frequent itemsets
Règles d'association
Extraction de règles d’association
import mlxtend
print(mlxtend.__version__)
0.22.0
import pandas as pd from mlxtend.preprocessing import TransactionEncoder from mlxtend.frequent_patterns import apriori, association_rules
DATASET_PATH = "Data/Sample - Superstore.xls"
data = pd.read_excel(DATASET_PATH, sheet_name="Orders") print(data.head().T)
0 ... 4 Row ID 1 ... 5 Order ID CA-2016-152156 ... US-2015-108966 Order Date 2016-11-08 00:00:00 ... 2015-10-11 00:00:00 Ship Date 2016-11-11 00:00:00 ... 2015-10-18 00:00:00 Ship Mode Second Class ... Standard Class Customer ID CG-12520 ... SO-20335 Customer Name Claire Gute ... Sean O'Donnell Segment Consumer ... Consumer Country United States ... United States City Henderson ... Fort Lauderdale State Kentucky ... Florida Postal Code 42420 ... 33311 Region South ... South Product ID FUR-BO-10001798 ... OFF-ST-10000760 Category Furniture ... Office Supplies Sub-Category Bookcases ... Storage Product Name Bush Somerset Collection Bookcase ... Eldon Fold 'N Roll Cart System Sales 261.96 ... 22.368 Quantity 2 ... 2 Discount 0.0 ... 0.2 Profit 41.9136 ... 2.5164 [21 rows x 5 columns]
print(data.shape)
(9994, 21)
print(data.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID 9994 non-null int64 1 Order ID 9994 non-null object 2 Order Date 9994 non-null datetime64[ns] 3 Ship Date 9994 non-null datetime64[ns] 4 Ship Mode 9994 non-null object 5 Customer ID 9994 non-null object 6 Customer Name 9994 non-null object 7 Segment 9994 non-null object 8 Country 9994 non-null object 9 City 9994 non-null object 10 State 9994 non-null object 11 Postal Code 9994 non-null int64 12 Region 9994 non-null object 13 Product ID 9994 non-null object 14 Category 9994 non-null object 15 Sub-Category 9994 non-null object 16 Product Name 9994 non-null object 17 Sales 9994 non-null float64 18 Quantity 9994 non-null int64 19 Discount 9994 non-null float64 20 Profit 9994 non-null float64 dtypes: datetime64[ns](2), float64(3), int64(3), object(13) memory usage: 1.6+ MB None
data = data[["Customer ID", "Sub-Category"]] print(data.sample(10))
Customer ID Sub-Category 6496 KH-16510 Accessories 484 MT-18070 Labels 855 BK-11260 Paper 227 DS-13180 Storage 6546 DP-13000 Paper 822 TR-21325 Accessories 6610 AG-10495 Binders 2434 PJ-18835 Envelopes 6121 JR-16210 Bookcases 5197 AI-10855 Phones
print(data.describe())
Customer ID Sub-Category count 9994 9994 unique 793 17 top WB-21850 Binders freq 37 1523
flatten = lambda cart: list(cart) carts = data.groupby(by="Customer ID").agg({"Sub-Category": flatten}).reset_index().rename(columns={"Sub-Category": "Chart"}) print(carts.head(10))
Customer ID Chart 0 AA-10315 [Appliances, Binders, Storage, Binders, Applia... 1 AA-10375 [Storage, Furnishings, Accessories, Binders, A... 2 AA-10480 [Paper, Furnishings, Paper, Storage, Paper, Pa... 3 AA-10645 [Chairs, Phones, Chairs, Furnishings, Envelope... 4 AB-10015 [Chairs, Art, Storage, Storage, Phones, Bookca... 5 AB-10060 [Accessories, Paper, Binders, Paper, Binders, ... 6 AB-10105 [Tables, Furnishings, Binders, Phones, Labels,... 7 AB-10150 [Accessories, Paper, Supplies, Art, Furnishing... 8 AB-10165 [Accessories, Paper, Art, Art, Paper, Binders,... 9 AB-10255 [Phones, Storage, Accessories, Supplies, Paper...
encoder = TransactionEncoder() encoder.fit(carts["Chart"]) chart_array = encoder.transform(carts["Chart"]) df_chart = pd.DataFrame(chart_array, columns=encoder.columns_, dtype=int) print(df_chart.head(10))
Accessories Appliances Art Binders Bookcases Chairs ... Machines Paper Phones Storage Supplies Tables 0 1 1 0 1 0 0 ... 0 1 1 1 1 0 1 1 0 1 1 0 0 ... 0 1 1 1 0 0 2 1 0 1 0 0 0 ... 0 1 1 1 0 1 3 0 0 1 1 1 1 ... 0 1 1 1 0 0 4 0 0 1 0 1 1 ... 0 0 1 1 0 0 5 1 1 0 1 0 1 ... 0 1 0 0 1 1 6 1 0 1 1 0 0 ... 1 0 1 1 0 1 7 1 0 1 1 0 0 ... 0 1 0 0 1 0 8 1 0 1 1 0 1 ... 0 1 0 1 0 0 9 1 0 1 1 0 0 ... 0 1 1 1 1 0 [10 rows x 17 columns]
frequent_itemsets = apriori(df_chart, min_support=0.45, use_colnames=True) print(frequent_itemsets)
support itemsets 0 0.597730 (Accessories) 1 0.622951 (Art) 2 0.819672 (Binders) 3 0.513241 (Chairs) 4 0.665826 (Furnishings) 5 0.770492 (Paper) 6 0.644388 (Phones) 7 0.648172 (Storage) 8 0.508197 (Accessories, Binders) 9 0.491803 (Paper, Accessories) 10 0.523329 (Art, Binders) 11 0.493064 (Paper, Art) 12 0.553594 (Furnishings, Binders) 13 0.650694 (Paper, Binders) 14 0.532156 (Binders, Phones) 15 0.546028 (Storage, Binders) 16 0.535939 (Paper, Furnishings) 17 0.450189 (Furnishings, Phones) 18 0.520807 (Paper, Phones) 19 0.535939 (Paper, Storage) 20 0.451450 (Paper, Furnishings, Binders) 21 0.460277 (Paper, Storage, Binders)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7) rules = rules[["antecedents", "consequents", "support", "confidence", "lift", "leverage"]] # rules print(rules)
antecedents consequents support confidence lift leverage 0 (Accessories) (Binders) 0.508197 0.850211 1.037257 0.018254 1 (Accessories) (Paper) 0.491803 0.822785 1.067870 0.031257 2 (Art) (Binders) 0.523329 0.840081 1.024899 0.012714 3 (Art) (Paper) 0.493064 0.791498 1.027263 0.013086 4 (Furnishings) (Binders) 0.553594 0.831439 1.014356 0.007835 5 (Paper) (Binders) 0.650694 0.844517 1.030311 0.019143 6 (Binders) (Paper) 0.650694 0.793846 1.030311 0.019143 7 (Phones) (Binders) 0.532156 0.825832 1.007515 0.003969 8 (Storage) (Binders) 0.546028 0.842412 1.027743 0.014740 9 (Furnishings) (Paper) 0.535939 0.804924 1.044689 0.022926 10 (Phones) (Paper) 0.520807 0.808219 1.048965 0.024311 11 (Storage) (Paper) 0.535939 0.826848 1.073143 0.036529 12 (Paper, Furnishings) (Binders) 0.451450 0.842353 1.027671 0.012156 13 (Furnishings, Binders) (Paper) 0.451450 0.815490 1.058402 0.024911 14 (Paper, Storage) (Binders) 0.460277 0.858824 1.047765 0.020983 15 (Paper, Binders) (Storage) 0.460277 0.707364 1.091323 0.038516 16 (Storage, Binders) (Paper) 0.460277 0.842956 1.094049 0.039568 17 (Storage) (Paper, Binders) 0.460277 0.710117 1.091323 0.038516