2016-07-28 5 views
2

Les règles d'association sont une technique très courante lorsque vous voulez savoir quels événements se produisent ensemble (comme le hamburger et le pain se vendent généralement ensemble). En marketing cette technique est utilisée pour trouver les produits complémentaires.Règles d'association inverses

Je suis à la recherche d'une technique pour extraire les "produits de substitution" et pour être c'est comme les règles de l'association inverse pour savoir quels événements sont moins susceptibles de se produire ensemble. Existe-t-il un algorithme ou une technique disponible dans Spark, R, Python, etc. pour cela?

Merci, Amir

Répondre

2

J'ai fait une mise en œuvre d'une très pratique pour la substitution Règle minière utilisant Teng, Hsieh and Chen (2002) pour R. Peut-être que cela peut vous aider à:

# Used packages: 
library(arules) 


SRM <- function(TransData, MinSup, MinConf, pMin, pChi, itemLabel, nTID){ 

# Packages ---------------------------------------------------------------- 

if (sum(search() %in% "package:arules") == 0) { 
stop("Please load package arules") 
} 

# Checking Input data ----------------------------------------------------- 
if (missing(TransData)) { 
    stop("Transaction data is missing") 
} 

if (is.numeric(nTID) == FALSE) { 
    stop("nTID has to be one numeric number for the count of  
Transactions") 
} 

    if (length(nTID) > 1) { 
    stop("nTID has to be one number for the count of Transactions") 
    } 

    if (is.character(itemLabel) == FALSE) { 
    stop("itemLabel has to be a character") 
    } 
    # Concrete Item sets --------------------------------------------------- 

    # adding complements to transaction data 
    compl_trans <- addComplement(TransData,labels = itemLabel) 
    compl_tab <- crossTable(compl_trans,"support") 
    compl_tab_D <- as.data.frame(compl_tab) 
    # ordering matrix 
    compl_tab_D <-   compl_tab_D[order(rownames((compl_tab))),order(colnames((compl_tab)))] 


    # Chi Value --------------------------------------------------------------- 


    # empty data frame for loop 

    complement_data <- data.frame(Chi = as.numeric(), 
          Sup_X.Y = as.numeric(), 
          X = as.character(), 
          Sup_X = as.numeric(), 
          Y = as.character(), 
          Sup_Y = as.numeric(), 
          CX = as.character(), 
          SupCX = as.numeric(), 
          CY = as.character(), 
          Sup_CY = as.numeric(), 
          Conf_X.CY = as.numeric(), 
          Sup_X.CY = as.numeric(), 
          Conf_Y.CX = as.numeric(), 
          SupY_CX = as.numeric()) 



    # first loop for one item 
    for (i in 1 : (length(itemLabel) - 1)) { 
    # second loop combines it with all other items 
    for (u in (i + 1) : length(itemLabel)) { 


    # getting chi value from Teng 
    a <- itemLabel[i] 
    b <- itemLabel[u] 
    ca <- paste0("!", itemLabel[i]) 
    cb <- paste0("!", itemLabel[u]) 

    chiValue <- nTID * (
    compl_tab[ca, cb]^2/(compl_tab[ca, ca] * compl_tab[cb, cb]) + 
     compl_tab[ca, b]^2/(compl_tab[ca, ca] * compl_tab[b, b]) + 
     compl_tab[a, cb]^2/(compl_tab[a, a] * compl_tab[cb, cb]) + 
     compl_tab[a, b]^2/(compl_tab[a, a] * compl_tab[b, b]) - 1) 



    # condition to be dependent 
    if (compl_tab[a, b] > compl_tab[a, a] * compl_tab[b, b] &&  chiValue >= qchisq(pChi, 1) && 
     compl_tab[a, a] >= MinSup && compl_tab[b, b] >= MinSup) { 



    chi_sup <- data.frame(Chi = chiValue, 
         Sup_X.Y = compl_tab[a, b], 
         X = a, 
         Sup_X = compl_tab[a, a], 
         Y = b, 
         Sup_Y = compl_tab[b, b], 
         CX = ca, 
         SupCX = compl_tab[ca, ca], 
         CY = cb, 
         Sup_CY = compl_tab[cb, cb], 
         Conf_X.CY = compl_tab[a, cb]/compl_tab[a, a], 
         Sup_X.CY = compl_tab[a, cb], 
         Conf_Y.CX = compl_tab[ca, b]/compl_tab[b, b], 
         SupY_CX = compl_tab[ca, b]) 


    try(complement_data <- rbind(complement_data, chi_sup)) 

    } 


    } 
    } 
    if (nrow(complement_data) == 0) { 
    stop("No complement item sets could have been found") 
    } 


    # changing mode of 
    complement_data$X <- as.character(complement_data$X) 
    complement_data$Y <- as.character(complement_data$Y) 


    # calculating support for concrete itemsets with all others and their complements ------------------- 


    ## with complements 
    matrix_trans <- as.data.frame(as(compl_trans, "matrix")) 

    sup_three <- data.frame(Items = as.character(), 
        Support = as.numeric()) 


    setCompl <- names(matrix_trans) 
    # 1. extracts all other values than that are not in the itemset 
    for (i in 1 : nrow(complement_data)) { 
    value <- setCompl[ !setCompl %in% c(complement_data$X[i], 
            complement_data$Y[i], 
            paste0("!", complement_data$X[i]), 
            paste0("!",complement_data$Y[i]))] 


    # 2. calculation of support 
    for (u in value) { 
    count <- sum(rowSums(matrix_trans[, c(complement_data$X[i],  complement_data$Y[i], u)]) == 3) 
    sup <- count/nTID 
    sup_three_items <- data.frame(Items =  paste0(complement_data$X[i], complement_data$Y[i], u), 
          Support=sup) 
    sup_three <- rbind(sup_three, sup_three_items) 
    } 

    } 

    # Correlation of single items------------------------------------------------------------- 


    # all items of concrete itemsets should be mixed for correlation 
    combis <- unique(c(complement_data$X, complement_data$Y)) 

    # empty object 
    rules<- data.frame(
    Substitute = as.character(), 
    Product = as.character(), 
    Support = as.numeric(), 
    Confidence = as.numeric(), 
    Correlation = as.numeric()) 

    # first loop for one item 
    for (i in 1 : (length(combis) - 1)) { 
    # second loop combines it with all other items 
    for (u in (i + 1) : length(combis)) { 

    first <- combis[i] 
    second <- combis[u] 

    corXY <- (compl_tab[first, second] - (compl_tab[first, first] *  compl_tab[second, second]))/
(sqrt((compl_tab[first, first] * (1 - compl_tab[first,first])) * 
     (compl_tab[second, second] * (1 - compl_tab[second, second])))) 


    # confidence 
    conf1 <- compl_tab[first, paste0("!", second)]/compl_tab[first, first] 
    conf2 <- compl_tab[second, paste0("!", first)]/compl_tab[second, second] 

    two_rules <- data.frame(
    Substitute = c(paste("{", first, "}"), 
       paste("{", second, "}")), 
    Product = c(paste("=>", "{", second, "}"), 
      paste("=>", "{", first, "}")), 
    Support = c(compl_tab[first, paste0("!", second)], compl_tab[second, paste0("!", first)]), 
    Confidence = c(conf1, conf2), 
    Correlation = c(corXY, corXY) 
    ) 

    # conditions 
    try({ 
    if (two_rules$Correlation[1] < pMin) { 
     if (two_rules$Support[1] >= MinSup && two_rules$Confidence[1] >= MinConf) { 
     rules <- rbind(rules, two_rules[1, ]) 
} 
     if (two_rules$Support[2] >= MinSup && two_rules$Confidence[2] >= MinConf) { 
     rules <- rbind(rules, two_rules[2, ]) 
     } 

    } }) 

    } 
    } 


    # Correlation of concrete item pairs with single items -------------------- 
    # adding variable for loop 
    complement_data$XY <- paste0(complement_data$X, complement_data$Y) 

    # combination of items 
    for (i in 1 : nrow(complement_data)){ 

    # set of combinations from dependent items with single items 
    univector <- c(as.vector(unique(complement_data$X)),  as.vector(unique(complement_data$Y))) 
    univector <- univector[!univector %in% c(complement_data$X[i], complement_data$Y[i])] 

    combis <- c(complement_data[i,"XY"], univector) 



    for (u in 2 : length(combis)) { 
    corXYZ <-(sup_three[sup_three$Items == paste0(combis[1], combis[u]),2] - 
       complement_data[complement_data$XY == combis[1],"Sup_X.Y"] * 
      compl_tab[combis[u],combis[u]])/
(sqrt((complement_data[complement_data$XY == combis[1],"Sup_X.Y"] * 
     (1 - complement_data[complement_data$XY == combis[1],"Sup_X.Y"]) * 
     compl_tab[combis[u],combis[u]] * (1 - compl_tab[combis[u],combis[u]])))) 

    dataXYZ <- data.frame(
Substitute = paste("{", combis[1], "}"), 
Product = paste("=>", "{", combis[u], "}"), 
Support = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2], 
Confidence = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2]/
complement_data[complement_data$XY == combis[1],"Sup_X.Y"], 
Correlation = corXYZ) 


    # conditions 
    if (dataXYZ$Correlation < pMin && dataXYZ$Support >= MinSup && dataXYZ$Confidence >= MinConf) { 

    try(rules <- rbind(rules, dataXYZ)) 
    } 
    } 
    } 
    if (nrow(rules) == 0) { 
    message("Sorry no rules could have been calculated. Maybe change input conditions.") 
    }  else { 
    return(rules) 
    } 

    # end 
} 

Je pense une meilleure explication est mon blog: http://mattimeyer.github.io/2016-12-21-Substitution-Rule-Mining/

+0

Ceci est génial homme, thx. – JEquihua