Разбиение данных по переменной для ускорения нечеткого сопоставления с помощью stringdist

Я основываюсь на ответе на предыдущий вопрос о нечетком сопоставлении с использованием stringdist.

У меня есть два больших набора данных (~30 тыс. строк) с длинными строками (названия потребительских товаров), которые я хочу сопоставить нечетким образом, вычисляя оценку расстояния. В обоих списках названий продуктов есть предполагаемые совпадения, но некоторые продукты будут уникальными для каждого списка.

Проблема вот в чем: мой компьютер с трудом справляется с expand.grid на таком объеме данных, и R постоянно падает. Но у меня появилась идея, которая могла бы помочь с оптимизацией, — я просто не могу заставить ее работать.

Большинство моих строк можно разбить на группы (подмножества данных) по названию бренда (например, pantene, neutrogena и т. д.). Вместо того чтобы вычислять расстояние между всеми комбинациями строк, я хотел бы с помощью grep отобрать строки с названием бренда, получить подмножество данных и уже для него вычислить расстояние.

Во-первых, я использую ту же функцию, что и в предыдущем сообщении.

# Function by @C8H10N4O2
# Greedy one-to-one assignment over candidate pairs given in long form.
#
# a, b: parallel vectors naming the two members of every candidate pair.
# d:    distance of each pair (same length as a and b).
#
# Repeatedly takes the still-available pair with the smallest distance (the
# first one when several tie), marks it as chosen and rules out every other
# available pair that reuses either of its members.
#
# Returns cbind(a, b, d) restricted to the chosen pairs.
greedyAssign <- function(a, b, d) {
  # Per-row state: 0 = still a candidate, 1 = chosen, -1 = ruled out.
  state <- numeric(length(a))
  repeat {
    open <- state == 0
    if (!any(open)) {
      break
    }
    smallest <- min(d[open])
    tied <- d == smallest & open
    a_pick <- a[tied][1]
    b_pick <- b[tied & a == a_pick][1]
    state[a == a_pick & b == b_pick] <- 1
    state[state == 0 & (a == a_pick | b == b_pick)] <- -1
  }
  chosen <- state == 1
  cbind(a = a[chosen], b = b[chosen], d = d[chosen])
}

Затем я создаю вектор со всеми моими брендами; назовем его brand_filter.

# Brand names used to bucket the product titles. The entries are passed to
# grep() as regular-expression patterns, so metacharacters that are part of a
# name must be escaped: with unescaped dots "e.l.f." also matches unrelated
# titles such as "able life ...". "&" is not a metacharacter, so it is written
# without a backslash.
brand_filter <- c(
  "pantene", "neutrogena", "maybelline", "revlon", "colour prevails",
  "nyx professional makeup", "covergirl", "no7", "milani", "japonesque",
  "rimmel", "thebalm", "physicians formula", "e\\.l\\.f\\.", "almay",
  "soap & glory", "l'oreal paris"
)

Затем я запускаю цикл for, в котором фильтрую два набора данных (tar и wg) с помощью grep, после чего строю сетку комбинаций (expand.grid), вычисляю расстояние между строками и применяю функцию назначения.

# Print the first 15 brands as an R literal (debugging aid).
dput(brand_filter[1:15])

# Keep appending to an `all_matches` that already exists in the session;
# otherwise start from nothing instead of failing on an undefined object.
if (!exists("all_matches")) {
  all_matches <- NULL
}

# One slot per brand; filled slots are bound together once after the loop
# rather than growing a data frame with rbind() on every iteration.
brand_matches <- vector("list", length(brand_filter))
for (i in seq_along(brand_filter)) {
  # Compare the brand literally: drop regex escapes such as "\\&" and match
  # with fixed = TRUE, so that "e.l.f." cannot match "able life ...".
  brand <- gsub("\\", "", brand_filter[i], fixed = TRUE)
  d1 <- tar$product.title_r[grepl(brand, tar$product.title_r, fixed = TRUE)]
  d2 <- wg$product.title_r[grepl(brand, wg$product.title_r, fixed = TRUE)]
  if (length(d1) == 0L || length(d2) == 0L) {
    next # brand missing from one of the data sets: nothing to pair up
  }
  # Distance matrix in long form; keep the titles as character, not factor.
  d <- expand.grid(a_name = d1, b_name = d2, stringsAsFactors = FALSE)
  # String distance (use your favorite method here).
  d$dist <- stringdist(d$a_name, d$b_name, method = "jw")
  brand_matches[[i]] <- data.frame(greedyAssign(d$a_name, d$b_name, d$dist))
}
all_matches <- rbind(all_matches, do.call(rbind, brand_matches))

Хотя эта функция работает, она очень медленная. Я уверен, что есть более умный способ!

tar$product.title_r<-
    c("neutrogena oil free moisture ultra gentle facial moisturizer", 
    "pantene pro v overnight miracle repair serum", "able life space saver walker regal rose", 
    "neutrogena acne stress control triple action toner", "e.l.f. studio eyebrow kit,light", 
    "neutrogena mineral sheers loose powder foundation,natural beige", 
    "neutrogena naturals multivitamin nourishing moisturizer lotion", 
    "neutrogena healthy skin liquid makeup spf 20,classic ivory", 
    "able life bedside sturdy rail black", "neutrogena norwegian formula moisture wrap body lotion", 
    "e.l.f. shape & stay wax pencil clear", "pantene pro v damage detox rebuilding conditioner", 
    "pantene pro v 3 minute miracle curl perfection daily conditioner", 
    "l'oreal paris visible lift radiance booster", "neutrogena triple moisture professional deep recovery hair mask", 
    "neutrogena bb cream, spf 30,light/medium", "neutrogena transparent facial bar, original formula fragrance free", 
    "pantene pro v daily moisture renewal hydrating conditioner", 
    "pantene pro v repair and protect 3 minute miracle deep conditioner", 
    "pantene pro v medium thick hair solutions shampoo", "pantene daily moisture renewal foaming conditioner", 
    "neutrogena hydro boost hydrating cleansing gel", "neutrogena rapid clear foaming scrub acne treatment", 
    "pantene radiant color shine foaming conditioner", "pantene pro v ice shine luminous conditioner", 
    "neutrogena acne stress control power clear scrub", "pantene pro v truly relaxed hair moisturizing shampoo", 
    "e.l.f. studio angled contour brush", "pantene pro v radiant color shine 2 in 1 shampoo & conditioner", 
    "neutrogena healthy skin pressed powder compact,fair 10", "paul mitchell foaming pomade", 
    "e.l.f. mad for matte eyeshadow palette,nude mood", "e.l.f. skincare starter kit", 
    "pantene pro v smooth and sleek 3 minute miracle deep conditioner", 
    "depend real fit incontinence underwear for men, maximum absorbency, small/medium, gray gray", 
    "e.l.f. pointed powder brush", "neutrogena nourishing longwear makeup, spf 20,buff", 
    "pantene pro v 2in1 medium thick hair solutions shampoo & conditioner", 
    "e.l.f. studio makeup remover cleansing cloths", "pantene pro v truly relaxed hair moisturizing shampoo", 
    "pantene pro v color hair solutions shampoo", "neutrogena makeup remover cleansing towelettes fragrance free", 
    "paul mitchell firm style freeze and shine super spray", "depend real fit incontinence briefs for men, maximum absorbency large/extra large gray & blue", 
    "neutrogena t gel therapeutic shampoo original formula", "pantene pro v gold series repairing mask treatment", 
    "neutrogena hydro boost water gel spf15", "pantene pro v full & strong flexible conditioner", 
    "neutrogena healthy skin liquid makeup honey", "lierac liftissime re lifting eye serum", 
    "e.l.f. eyebrow stencil kit clear", "neutrogena skinclearing blemish concealer,buff 09", 
    "pantene pro v beautiful lengths shampoo", "pantene pro v classic clean conditioner", 
    "e.l.f. hd mattifying balm,clear", "pantene pro v volume root lifting spray hair gel", 
    "neutrogena ageless intensives anti wrinkle deep wrinkle daily moisturizer spf 20", 
    "pantene pro v classic clean 2 in 1 shampoo & conditioner", "pantene pro v daily moisture renewal moisturizing combing cream", 
    "pantene pro v 3 minute miracle radiant color deep conditioner", 
    "e.l.f. makeup remover pen", "pantene pro v fine hair solutions 2 in 1 shampoo & conditioner", 
    "neutrogena t gel therapeutic shampoo", "e.l.f. beautifully bare luminous matte makeup primer", 
    "e.l.f. beautifully bare eyeshadow,nude linen", "pantene pro v volume texturizing non aerosol hairspray", 
    "pantene pro v breakage defense conditioner", "ardell fashion lashes", 
    "neutrogena oil free eye makeup remover", "pantene pro v gold series moisture boost shampoo", 
    "neutrogena oil free acne wash pink grapefruit facial cleanser", 
    "e.l.f. studio cream eyeliner,black", "neutrogena makeup remover cleansing towelettes", 
    "neutrogena all in 1 acne control daily scrub", "neutrogena ultra gentle hydrating cleanser, creamy formula", 
    "neutrogena oil free acne wash cream cleanser pink grapefruit", 
    "e.l.f. aqua beauty primer mist clear", "e.l.f. mineral infused face primer", 
    "pantene pro v gold series deep hydrating co wash", "e.l.f. studio angled foundation brush", 
    "neutrogena oil free cleansing wipes pink grapefruit", "e.l.f. perfect finish hd powder,clear", 
    "neutrogena norwegian formula hand cream scented", "able life ez door knob grips", 
    "neutrogena body clear body scrub, salicylic acid acne treatment", 
    "depend real fit for men briefs maximum absorbency small/medium gray", 
    "depend real fit incontinence underwear for men, maximum absorbency, large/xlarge", 
    "neutrogena oil free facial moisturizer lotion spf 35", "special k cereal fruit & yogurt", 
    "teatrical stem cell facial moisturizer", "e.l.f. beautifully bare sheer tint finishing powder,light/medium", 
    "neutrogena triple moisture professional daily deep conditioner", 
    "e.l.f. studio kabuki face brush", "l'oreal paris visible lift radiance cheek duo,201 romantic in rose", 
    "e.l.f. studio blush,candid coral", "pantene pro v repair & protect shampoo", 
    "neutrogena 3 in 1 concealer for eyes,light", "neutrogena healthy skin compact makeup spf 55,classic ivory", 
    "able life auto assist grab bar", "e.l.f. beautifully bare foundation serum spf 25,fair/light"
    )

wg$product.title_r<-c("pantene curl perfection leave in conditioning spray   8.5 fl oz", 
"neutrogena shine control powder", "neutrogena ultra sheer body mist sunscreen broad spectrum   spf 100+   5oz", 
"neutrogena healthy lengths mascara", "pantene pro v gold series repairing mask   7.6oz", 
"neutrogena healthy skin blends", "pantene shaping hair gel 6.8 fl oz", 
"pantene pro v repair and protect dream care conditioner   23.7 fl oz", 
"pantene pro v air spray extra hold alcohol free hairspray   7oz", 
"pantene pro v sheer volume dream care conditioner   23.7 fl oz", 
"pantene pro v curl perfection moisturizing shampoo   20.1oz", 
"neutrogena hydro boost hydrating lip shine   0.12oz", "pantene 3 minute miracle sheer volume deep conditioner   8oz", 
"pantene curl perfection controlling curl crme   7.6oz", "neutrogena moisture shine lip soothers spf 20", 
"neutrogena t/sal therapeutic shampoo scalp build up control   4.5oz", 
"pantene pro v color preserve volume conditioner   17.7oz", "neutrogena healthy skin glow sheers   light shades   1.1 fl oz", 
"pantene pro v sheer volume shampoo", "pantene curl perfection conditioner   12.6 fl oz", 
"pantene pro v classic clean dream care conditioner   23.7 fl oz", 
"pantene pro v beautiful lengths dream care conditioner   23.7 fl oz", 
"pantene pro v smooth and sleek dream care shampoo   25 fl oz", 
"pantene sheer volume foam conditioner   6oz", "pantene pro v ultimate 10 bb shampoo", 
"pantene pro v radiant color shine dream care shampoo   25 fl oz", 
"pantene pro v curl enhancing spray gel 5.7 fl oz", "pantene extra strong hold level 4 hold hairspray   11oz", 
"neutrogena ultra sheer broad spectrum sunscreen body mist   spf 30   5oz", 
"pantene pro v classic clean dream care shampoo   25 fl oz", 
"neutrogena healthy volume mascara", "neutrogena hydro boost hydrating concealer", 
"neutrogena men skin clearing acne wash   3 pk", "pantene pro v daily moisture renewal dream care shampoo   25 fl oz", 
"pantene daily moisture renewal shampoo", "pantene daily moisture renewal hair shampoo   travel size  3.38 fl oz", 
"neutrogena anti residue gentle clarifying shampoo   6 fl oz", 
"pantene radiant colour shine foam conditioner   6oz", "neutrogena hydro boost plumping mascara", 
"pantene pro v curl perfection moisturizing conditioner   17.7oz", 
"neutrogena healthy skin liquid makeup   fair shades   1 fl oz", 
"pantene pro v radiant color shine dream care conditioner   23.7 fl oz", 
"pantene pro v micellar shampoo   17.9 fl oz", "neutrogena wet skin sunscreen spray broad spectrum   spf 50   5 fl oz", 
"pantene pro v beautiful lengths shampoo", "pantene pro v ultimate 10 bb conditioner", 
"neutrogena mineral sheers compact powder", "neutrogena healthy skin anti aging perfector", 
"pantene pro v micellar shampoo   10.1 fl oz", "neutrogena build a tan lotion   6.7oz"
)

Ответы

Ответ 1

Возможно, я что-то упускаю, но greedyAssign кажется сложнее, чем необходимо. Например, даже с помощью базовой функции R для нечеткого сопоставления (adist) можно получить более векторизованный код.

# For every string in `a`, find the closest string in `b` by generalized
# Levenshtein distance (utils::adist).
#
# a, b: character vectors.
#
# Returns a character matrix with one row per element of `a` and the columns
# "a" (the input string), "b" (its nearest string in `b`) and "dist" (the edit
# distance between the two). Ties go to the first candidate in `b`, so the
# result is reproducible; "b" and "dist" are NA when `b` is empty.
fuzzy.matcher <- function(a, b) {
  matches <- matrix(
    NA_character_,
    nrow = length(a), ncol = 3L,
    dimnames = list(NULL, c("a", "b", "dist"))
  )
  if (length(a) == 0L) {
    return(matches)
  }
  matches[, "a"] <- a
  if (length(b) == 0L) {
    return(matches)
  }

  dists <- adist(a, b) # length(a) x length(b) distance matrix
  # max.col() on the negated matrix yields the column of the row minimum;
  # its default ties.method is "random", hence the explicit "first".
  best <- max.col(-dists, ties.method = "first")
  matches[, "b"] <- b[best]
  # Read the winning distances straight from the matrix (one cell per row).
  matches[, "dist"] <- dists[cbind(seq_along(a), best)]
  matches
}

Такой подход избавляет от цикла while, заменяя его одним вызовом apply, который к тому же необязателен (он возвращает значение расстояния, поэтому полезен только для отладки).

Вторую часть кода, предназначенную для разбиения данных на группы, также можно упростить. Мы можем заменить grep в цикле for на pmatch, что работает быстрее. Единственный недостаток — предположение, что у одного продукта только один бренд.

  # Index into brand_filter of the first brand whose name occurs in each title
  # (NA when none does). pmatch(titles, brand_filter) cannot do this: it
  # treats the titles as abbreviations of the brand names, and a full title is
  # never a prefix of a brand, so every index would be NA.
  find.brand <- function(titles) {
    index <- rep(NA_integer_, length(titles))
    # Walk the brands backwards so that earlier entries win on overlap.
    for (k in rev(seq_along(brand_filter))) {
      # Compare literally: drop regex escapes such as "\\&" from the entry.
      brand <- gsub("\\", "", brand_filter[k], fixed = TRUE)
      index[grepl(brand, titles, fixed = TRUE)] <- k
    }
    index
  }
  brand.index.tar <- find.brand(tar$product.title_r)
  brand.index.wg <- find.brand(wg$product.title_r)
  # One character vector of titles per brand, named by the brand's index.
  split.tar <- split(tar$product.title_r, brand.index.tar)
  split.wg <- split(wg$product.title_r, brand.index.wg)
  # Only brands present in both data sets can be matched; selecting them by
  # name keeps the two lists aligned brand by brand.
  shared.brands <- intersect(names(split.tar), names(split.wg))
  # SIMPLIFY = FALSE: always return a list with one match matrix per brand.
  mapply(
    fuzzy.matcher, split.tar[shared.brands], split.wg[shared.brands],
    SIMPLIFY = FALSE
  )

Таким образом, единственное, что выполняется в цикле, — это fuzzy.matcher() во внутреннем цикле mapply().

У меня нет количественных доказательств того, что это сократит время вычислений. Однако этот код векторизован гораздо сильнее, чем предыдущий вариант, поэтому он должен быть быстрее.

Ответ 2

Функция JMenezes работает лучше старой благодаря отсутствию явного цикла и использованию векторизованных функций. Я провел бенчмарк: можно ожидать ускорения в 100 и более раз.

# Self-contained benchmark body: builds the two sample title sets, buckets
# them by brand and fuzzy-matches every `tar` title against the `wg` titles of
# the same brand.
#
# Returns a list with one character matrix per brand found in both sets
# (named by the brand's index in brand_filter); each matrix has the columns
# "a" (tar title), "b" (closest wg title) and "dist" (edit distance).
#
# NOTE: the earlier version bucketed with pmatch(titles, brand_filter), which
# treats the titles as abbreviations of the brand names; that yields only NA,
# an empty split and an empty result, so timings of that version measure
# almost no matching work.
JMenezes_function <- function(){
  tar <- list()
  wg <- list()
  tar$product.title_r<-
    c("neutrogena oil free moisture ultra gentle facial moisturizer", 
      "pantene pro v overnight miracle repair serum", "able life space saver walker regal rose", 
      "neutrogena acne stress control triple action toner", "e.l.f. studio eyebrow kit,light", 
      "neutrogena mineral sheers loose powder foundation,natural beige", 
      "neutrogena naturals multivitamin nourishing moisturizer lotion", 
      "neutrogena healthy skin liquid makeup spf 20,classic ivory", 
      "able life bedside sturdy rail black", "neutrogena norwegian formula moisture wrap body lotion", 
      "e.l.f. shape & stay wax pencil clear", "pantene pro v damage detox rebuilding conditioner", 
      "pantene pro v 3 minute miracle curl perfection daily conditioner", 
      "l'oreal paris visible lift radiance booster", "neutrogena triple moisture professional deep recovery hair mask", 
      "neutrogena bb cream, spf 30,light/medium", "neutrogena transparent facial bar, original formula fragrance free", 
      "pantene pro v daily moisture renewal hydrating conditioner", 
      "pantene pro v repair and protect 3 minute miracle deep conditioner", 
      "pantene pro v medium thick hair solutions shampoo", "pantene daily moisture renewal foaming conditioner", 
      "neutrogena hydro boost hydrating cleansing gel", "neutrogena rapid clear foaming scrub acne treatment", 
      "pantene radiant color shine foaming conditioner", "pantene pro v ice shine luminous conditioner", 
      "neutrogena acne stress control power clear scrub", "pantene pro v truly relaxed hair moisturizing shampoo", 
      "e.l.f. studio angled contour brush", "pantene pro v radiant color shine 2 in 1 shampoo & conditioner", 
      "neutrogena healthy skin pressed powder compact,fair 10", "paul mitchell foaming pomade", 
      "e.l.f. mad for matte eyeshadow palette,nude mood", "e.l.f. skincare starter kit", 
      "pantene pro v smooth and sleek 3 minute miracle deep conditioner", 
      "depend real fit incontinence underwear for men, maximum absorbency, small/medium, gray gray", 
      "e.l.f. pointed powder brush", "neutrogena nourishing longwear makeup, spf 20,buff", 
      "pantene pro v 2in1 medium thick hair solutions shampoo & conditioner", 
      "e.l.f. studio makeup remover cleansing cloths", "pantene pro v truly relaxed hair moisturizing shampoo", 
      "pantene pro v color hair solutions shampoo", "neutrogena makeup remover cleansing towelettes fragrance free", 
      "paul mitchell firm style freeze and shine super spray", "depend real fit incontinence briefs for men, maximum absorbency large/extra large gray & blue", 
      "neutrogena t gel therapeutic shampoo original formula", "pantene pro v gold series repairing mask treatment", 
      "neutrogena hydro boost water gel spf15", "pantene pro v full & strong flexible conditioner", 
      "neutrogena healthy skin liquid makeup honey", "lierac liftissime re lifting eye serum", 
      "e.l.f. eyebrow stencil kit clear", "neutrogena skinclearing blemish concealer,buff 09", 
      "pantene pro v beautiful lengths shampoo", "pantene pro v classic clean conditioner", 
      "e.l.f. hd mattifying balm,clear", "pantene pro v volume root lifting spray hair gel", 
      "neutrogena ageless intensives anti wrinkle deep wrinkle daily moisturizer spf 20", 
      "pantene pro v classic clean 2 in 1 shampoo & conditioner", "pantene pro v daily moisture renewal moisturizing combing cream", 
      "pantene pro v 3 minute miracle radiant color deep conditioner", 
      "e.l.f. makeup remover pen", "pantene pro v fine hair solutions 2 in 1 shampoo & conditioner", 
      "neutrogena t gel therapeutic shampoo", "e.l.f. beautifully bare luminous matte makeup primer", 
      "e.l.f. beautifully bare eyeshadow,nude linen", "pantene pro v volume texturizing non aerosol hairspray", 
      "pantene pro v breakage defense conditioner", "ardell fashion lashes", 
      "neutrogena oil free eye makeup remover", "pantene pro v gold series moisture boost shampoo", 
      "neutrogena oil free acne wash pink grapefruit facial cleanser", 
      "e.l.f. studio cream eyeliner,black", "neutrogena makeup remover cleansing towelettes", 
      "neutrogena all in 1 acne control daily scrub", "neutrogena ultra gentle hydrating cleanser, creamy formula", 
      "neutrogena oil free acne wash cream cleanser pink grapefruit", 
      "e.l.f. aqua beauty primer mist clear", "e.l.f. mineral infused face primer", 
      "pantene pro v gold series deep hydrating co wash", "e.l.f. studio angled foundation brush", 
      "neutrogena oil free cleansing wipes pink grapefruit", "e.l.f. perfect finish hd powder,clear", 
      "neutrogena norwegian formula hand cream scented", "able life ez door knob grips", 
      "neutrogena body clear body scrub, salicylic acid acne treatment", 
      "depend real fit for men briefs maximum absorbency small/medium gray", 
      "depend real fit incontinence underwear for men, maximum absorbency, large/xlarge", 
      "neutrogena oil free facial moisturizer lotion spf 35", "special k cereal fruit & yogurt", 
      "teatrical stem cell facial moisturizer", "e.l.f. beautifully bare sheer tint finishing powder,light/medium", 
      "neutrogena triple moisture professional daily deep conditioner", 
      "e.l.f. studio kabuki face brush", "l'oreal paris visible lift radiance cheek duo,201 romantic in rose", 
      "e.l.f. studio blush,candid coral", "pantene pro v repair & protect shampoo", 
      "neutrogena 3 in 1 concealer for eyes,light", "neutrogena healthy skin compact makeup spf 55,classic ivory", 
      "able life auto assist grab bar", "e.l.f. beautifully bare foundation serum spf 25,fair/light"
    )

  wg$product.title_r<-
    c("pantene curl perfection leave in conditioning spray   8.5 fl oz", 
      "neutrogena shine control powder", "neutrogena ultra sheer body mist sunscreen broad spectrum   spf 100+   5oz", 
      "neutrogena healthy lengths mascara", "pantene pro v gold series repairing mask   7.6oz", 
      "neutrogena healthy skin blends", "pantene shaping hair gel 6.8 fl oz", 
      "pantene pro v repair and protect dream care conditioner   23.7 fl oz", 
      "pantene pro v air spray extra hold alcohol free hairspray   7oz", 
      "pantene pro v sheer volume dream care conditioner   23.7 fl oz", 
      "pantene pro v curl perfection moisturizing shampoo   20.1oz", 
      "neutrogena hydro boost hydrating lip shine   0.12oz", "pantene 3 minute miracle sheer volume deep conditioner   8oz", 
      "pantene curl perfection controlling curl crme   7.6oz", "neutrogena moisture shine lip soothers spf 20", 
      "neutrogena t/sal therapeutic shampoo scalp build up control   4.5oz", 
      "pantene pro v color preserve volume conditioner   17.7oz", "neutrogena healthy skin glow sheers   light shades   1.1 fl oz", 
      "pantene pro v sheer volume shampoo", "pantene curl perfection conditioner   12.6 fl oz", 
      "pantene pro v classic clean dream care conditioner   23.7 fl oz", 
      "pantene pro v beautiful lengths dream care conditioner   23.7 fl oz", 
      "pantene pro v smooth and sleek dream care shampoo   25 fl oz", 
      "pantene sheer volume foam conditioner   6oz", "pantene pro v ultimate 10 bb shampoo", 
      "pantene pro v radiant color shine dream care shampoo   25 fl oz", 
      "pantene pro v curl enhancing spray gel 5.7 fl oz", "pantene extra strong hold level 4 hold hairspray   11oz", 
      "neutrogena ultra sheer broad spectrum sunscreen body mist   spf 30   5oz", 
      "pantene pro v classic clean dream care shampoo   25 fl oz", 
      "neutrogena healthy volume mascara", "neutrogena hydro boost hydrating concealer", 
      "neutrogena men skin clearing acne wash   3 pk", "pantene pro v daily moisture renewal dream care shampoo   25 fl oz", 
      "pantene daily moisture renewal shampoo", "pantene daily moisture renewal hair shampoo   travel size  3.38 fl oz", 
      "neutrogena anti residue gentle clarifying shampoo   6 fl oz", 
      "pantene radiant colour shine foam conditioner   6oz", "neutrogena hydro boost plumping mascara", 
      "pantene pro v curl perfection moisturizing conditioner   17.7oz", 
      "neutrogena healthy skin liquid makeup   fair shades   1 fl oz", 
      "pantene pro v radiant color shine dream care conditioner   23.7 fl oz", 
      "pantene pro v micellar shampoo   17.9 fl oz", "neutrogena wet skin sunscreen spray broad spectrum   spf 50   5 fl oz", 
      "pantene pro v beautiful lengths shampoo", "pantene pro v ultimate 10 bb conditioner", 
      "neutrogena mineral sheers compact powder", "neutrogena healthy skin anti aging perfector", 
      "pantene pro v micellar shampoo   10.1 fl oz", "neutrogena build a tan lotion   6.7oz"
    )

  brand_filter<-c("pantene","neutrogena","maybelline", "revlon", "colour prevails", "nyx professional makeup", 
                  "covergirl", "no7", "milani", "japonesque", "rimmel", "thebalm", 
                  "physicians formula", "e.l.f.", "almay", "soap \\& glory", "l'oreal paris"
  )

  # For every string in `a`, the closest string in `b` by edit distance.
  # Ties go to the first candidate so repeated runs give the same answer;
  # "b" and "dist" stay NA when `b` is empty.
  fuzzy.matcher <- function(a, b) {
    matches <- matrix(
      NA_character_,
      nrow = length(a), ncol = 3L,
      dimnames = list(NULL, c("a", "b", "dist"))
    )
    if (length(a) == 0L) {
      return(matches)
    }
    matches[, "a"] <- a
    if (length(b) == 0L) {
      return(matches)
    }
    dists <- adist(a, b) # length(a) x length(b) distance matrix
    # Column of each row minimum; max.col() breaks ties randomly by default.
    best <- max.col(-dists, ties.method = "first")
    matches[, "b"] <- b[best]
    matches[, "dist"] <- dists[cbind(seq_along(a), best)]
    matches
  }

  # Index into brand_filter of the first brand whose name occurs in each
  # title (NA when none does).
  find.brand <- function(titles) {
    index <- rep(NA_integer_, length(titles))
    # Walk the brands backwards so that earlier entries win on overlap.
    for (k in rev(seq_along(brand_filter))) {
      # Compare literally: drop regex escapes such as "\\&" from the entry,
      # and never let the dots of "e.l.f." act as wildcards.
      brand <- gsub("\\", "", brand_filter[k], fixed = TRUE)
      index[grepl(brand, titles, fixed = TRUE)] <- k
    }
    index
  }

  brand.index.tar <- find.brand(tar$product.title_r)
  brand.index.wg <- find.brand(wg$product.title_r)
  # One character vector of titles per brand, named by the brand's index.
  split.tar <- split(tar$product.title_r, brand.index.tar)
  split.wg <- split(wg$product.title_r, brand.index.wg)
  # Only brands present in both sets can be matched; selecting them by name
  # keeps the two lists aligned brand by brand.
  shared.brands <- intersect(names(split.tar), names(split.wg))
  mapply(
    fuzzy.matcher, split.tar[shared.brands], split.wg[shared.brands],
    SIMPLIFY = FALSE
  )

}

# Time 50 evaluations of each implementation and report the summary in
# seconds. NOTE(review): old_function() is not defined in this excerpt —
# presumably it wraps the original grep/greedyAssign loop; confirm.
microbenchmark::microbenchmark(old_function(), JMenezes_function(), 
                               times = 50, unit = "s")
Unit: seconds
                expr         min          lq         mean      median
      old_function() 0.019502631 0.025619695 0.0290760771 0.028003105
 JMenezes_function() 0.000135669 0.000143731 0.0002068144 0.000193383
          uq         max neval
 0.029421126 0.052666887    50
 0.000265149 0.000353758    50