Mirror of https://github.com/ClaytonWWilson/Listify.git (synced 2025-12-13 01:38:47 +00:00)
Lambdaify Kohls scraping
Move to a Lambda function setup and add orchestration and word lists for full runs. Credit for the word lists goes to http://www.desiquintans.com/nounlist
parent 7dbea8f805
commit 79598bf9e9
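With these pieces in place, a full run would presumably be kicked off by invoking the orchestrator Lambda (runOrchestrator.py below) once. A sketch of that trigger; the deployed function name "runOrchestrator" is an assumption, as only "KohlsScraper" is named anywhere in this commit:

# Hypothetical trigger for a full scraping run (not part of the commit).
import boto3

boto3.client("lambda").invoke(
    FunctionName="runOrchestrator",   # assumed deployed name of runOrchestrator.py
    InvocationType="Event",           # fire-and-forget, same style the orchestrator itself uses
    Payload="{}",                     # the orchestrator's handler ignores the event
)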
.gitignore (vendored): 1 line added

@@ -92,3 +92,4 @@ Lambdas/Lists/target/surefire-reports/TestInputUtils.txt
 Lambdas/Lists/target/surefire-reports/TEST-TestInputUtils.xml
 Lambdas/Scraping/scraperConfigs.json
 Lambdas/Scraping/dbConfigs.json
+Lambdas/Scraping/artifacts/*
Lambdas/Scraping/KohlsScraper.py (modified): 71 lines before, 80 after

Previously the scraper ran as a flat script: the search term was hard-coded ("search=shoes"), the raw Zenscrape response was printed, and everything executed at import time. The scraping and database code is now wrapped in a lambda_handler that takes the search term from event["toScrape"]; the module-level config load appears to be retained alongside a per-invocation reload inside the handler. The resulting file:

import requests
import json
from bs4 import BeautifulSoup
import pymysql.cursors
import time
import re
import traceback  # needed for traceback.print_exc() below; not imported in the committed file

scraper_configs = None
with open("scraperConfigs.json", "r") as scraper_configs_file:
    scraper_configs = json.load(scraper_configs_file)

def lambda_handler(event, context):
    print(event["toScrape"])

    scraper_configs = None
    with open("scraperConfigs.json", "r") as scraper_configs_file:
        scraper_configs = json.load(scraper_configs_file)

    headers = {
        "apikey": scraper_configs["apikey"]
    }

    # Zenscrape fetches the Kohls search results page for the requested term.
    params = (
        ("url", "https://www.kohls.com/search.jsp?submit-search=web-regular&search=" + event["toScrape"]),
        ("location", "na"),
    )

    response = requests.get("https://app.zenscrape.com/api/v1/get", headers=headers, params=params)

    soup = BeautifulSoup(response.text, "html.parser")

    insert_params = []

    # Each product tile carries an element whose id ends in "_prod_price"; its text holds
    # the price(s) and the product name. The first "$" token is taken as the price,
    # parenthesised original-price fragments are skipped, and the words after the last
    # "$" token become the description.
    for match in soup.find_all(id=re.compile(".*_prod_price")):
        price = None
        description = ""
        match_split = match.text.split()
        for section in match_split:
            if '$' in section:
                description = ""
                if price == None:
                    price = section
                continue
            if ('(' in section) or (')' in section):
                continue
            description += section + " "
        description = description.strip()
        imgUrl = ""
        imgUrlBase = "https://media.kohlsimg.com/is/image/kohls/"
        # The product image URL lives in a preceding sibling element; pull out the kohlsimg path.
        for prior in match.previous_siblings:
            if imgUrlBase in str(prior):
                imgUrl = imgUrlBase + str(prior).split(imgUrlBase)[1].split('?')[0].split('"')[0]
        print(price + " for: " + description + " @: " + imgUrl)
        insert_params.append((3, description, float(price.split('$')[1]), imgUrl))

    db_configs = None
    with open("dbConfigs.json", "r") as db_configs_file:
        db_configs = json.load(db_configs_file)

    connection = pymysql.connect(host=db_configs["host"],
                                 user=db_configs["user"],
                                 password=db_configs["password"],
                                 db=db_configs["db_name"],
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    try:
        with connection.cursor() as cursor:
            PRODUCT_INSERT_SYNTAX = "INSERT INTO Product (chainID, description, price, imageURL) VALUES (%s, %s, %s, %s);"
            cursor.executemany(PRODUCT_INSERT_SYNTAX, insert_params)
        connection.commit()
    except Exception as e:
        print(e)
        traceback.print_exc()
    finally:
        connection.close()

    return {
        'statusCode': 200,
        'body': 'Scraped: ' + event["toScrape"]
    }
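For local testing outside Lambda, the handler can be exercised directly with a stub event. This is only a sketch: it assumes scraperConfigs.json and dbConfigs.json sit next to the script and that the Zenscrape API and the database are reachable.

# Minimal local smoke test (not part of the commit); the Lambda context is unused by the handler.
from KohlsScraper import lambda_handler

result = lambda_handler({"toScrape": "shoes"}, None)
print(result)   # expected shape: {'statusCode': 200, 'body': 'Scraped: shoes'}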
Lambdas/Scraping/buildKohlsZip.sh (new file): 8 lines

#!/bin/bash
# Currently to be run only from the Scraping directory
OLDPWD=$(pwd)
cd build/
zip -r9 ${OLDPWD}/artifacts/kohlsScraper.zip .
cd ${OLDPWD}
zip -r9 artifacts/kohlsScraper.zip *.json
zip -r9 artifacts/kohlsScraper.zip KohlsScraper.py
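Once the zip exists, one plausible way to push it to the function is boto3's update_function_code. A sketch, assuming the KohlsScraper function already exists and AWS credentials/region are configured; the actual deployment method is not part of this commit:

# Hypothetical deploy step: upload artifacts/kohlsScraper.zip to the existing KohlsScraper function.
import boto3

with open("artifacts/kohlsScraper.zip", "rb") as zip_file:
    boto3.client("lambda").update_function_code(
        FunctionName="KohlsScraper",
        ZipFile=zip_file.read(),
    )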
Lambdas/Scraping/nounlist.txt (new file): 6801 lines (diff suppressed because it is too large)
Lambdas/Scraping/prefix_list_builder.py (new file): 25 lines

import json  # json.dump below needs this; missing from the committed script

wordlist = []
with open("nounlist.txt") as nounlist:
    for noun in nounlist:
        wordlist.append(noun)

# Keep only the first three characters of each noun as a search prefix.
prefix_list = []
for word in wordlist:
    prefix_list.append(word[:min(len(word), 3)])

# Split the prefixes into two parts, dropping consecutive duplicates:
# the first 700 go to part 1, the remainder to part 2.
short_list = []
short_list2 = []
for prefix in prefix_list:
    prefix = prefix.strip()
    if len(short_list) < 700:
        if (len(short_list) == 0 or short_list[-1] != prefix):
            short_list.append(prefix)
    else:
        if ((len(short_list2) == 0 or short_list2[-1] != prefix) and short_list[-1] != prefix):
            short_list2.append(prefix)

with open("prefix_list_part1.txt", "w") as prefix_list_part1:
    json.dump(short_list, prefix_list_part1)

with open("prefix_list_part2.txt", "w") as prefix_list_part2:
    json.dump(short_list2, prefix_list_part2)
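The orchestrator below reads its word list from words.txt, so presumably one of these generated prefix files is deployed under that name; that mapping is not spelled out in the commit. A quick sanity check of the generated output could look like this (file name taken from the script above):

# Hypothetical check of the generated prefix list.
import json

with open("prefix_list_part1.txt") as f:
    prefixes = json.load(f)

print(len(prefixes), prefixes[:5])   # e.g. 700 ['ATM', 'CD', 'SUV', 'TV', 'aar']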
Lambdas/Scraping/prefix_list_part1.txt (new file): 1 line

["ATM", "CD", "SUV", "TV", "aar", "aba", "abb", "abd", "abi", "abn", "abo", "abr", "abs", "abu", "aca", "acc", "ace", "ach", "aci", "ack", "aco", "acq", "acr", "act", "acu", "ad", "ada", "add", "adj", "adm", "ado", "adr", "adu", "adv", "aff", "afo", "aft", "age", "agg", "agl", "ago", "agr", "aid", "aim", "air", "ala", "alb", "alc", "ald", "ale", "alf", "alg", "ali", "all", "alm", "alp", "alt", "alu", "ama", "amb", "ame", "amm", "amn", "amo", "amu", "ana", "anc", "and", "ane", "ang", "ani", "ank", "ann", "ano", "ans", "ant", "anx", "any", "apa", "ape", "apo", "app", "apr", "aps", "aqu", "arc", "are", "arg", "ari", "ark", "arm", "arr", "art", "asc", "ash", "asi", "asp", "ass", "ast", "asy", "ate", "ath", "atm", "ato", "atr", "att", "auc", "aud", "aun", "aut", "ava", "ave", "avo", "awa", "awe", "axi", "azi", "bab", "bac", "bad", "baf", "bag", "bai", "bak", "bal", "bam", "ban", "bao", "bar", "bas", "bat", "bay", "bea", "bec", "bed", "bee", "beg", "beh", "bei", "bel", "ben", "ber", "bes", "bet", "bev", "bey", "bia", "bib", "bic", "bid", "bif", "bij", "bik", "bil", "bin", "bio", "bip", "bir", "bis", "bit", "bla", "ble", "bli", "blo", "blu", "boa", "bob", "bod", "bog", "bol", "bom", "bon", "boo", "bor", "bos", "bot", "bou", "bow", "box", "boy", "bra", "bre", "bri", "bro", "bru", "bub", "buc", "bud", "buf", "bug", "bui", "bul", "bum", "bun", "bur", "bus", "but", "buy", "buz", "c-c", "cab", "cac", "cad", "caf", "cag", "cak", "cal", "cam", "can", "cap", "car", "cas", "cat", "cau", "cav", "cay", "cei", "cel", "cem", "cen", "cep", "cer", "ces", "cha", "che", "chi", "cho", "chr", "chu", "cic", "cig", "cil", "cin", "cir", "cit", "civ", "cla", "cle", "cli", "clo", "clu", "co-", "coa", "cob", "coc", "cod", "coe", "cof", "coh", "coi", "cok", "col", "com", "con", "coo", "cop", "cor", "cos", "cot", "cou", "cov", "cow", "coy", "cra", "cre", "cri", "cro", "cru", "cry", "cub", "cuc", "cue", "cuf", "cui", "cul", "cum", "cup", "cur", "cus", "cut", "cyc", "cyg", "cyl", "cym", "cyn", "cys", "cyt", "dad", "daf", "dag", "dah", "dai", "dam", "dan", "dar", "das", "dat", "dau", "daw", "day", "dea", "deb", "dec", "ded", "dee", "def", "deg", "del", "dem", "den", "deo", "dep", "der", "des", "det", "dev", "dew", "dho", "dia", "dib", "dic", "die", "dif", "dig", "dil", "dim", "din", "dio", "dip", "dir", "dis", "div", "doc", "doe", "dog", "doi", "dol", "dom", "don", "doo", "dor", "dos", "dot", "dou", "dow", "doz", "dra", "dre", "dri", "dro", "dru", "dry", "duc", "dud", "due", "duf", "dug", "dul", "dum", "dun", "dup", "dur", "dus", "dut", "dwa", "dwe", "dyn", "dys", "e-b", "e-m", "e-r", "eag", "ear", "eas", "eat", "eav", "ecc", "ech", "ecl", "eco", "ect", "ecu", "edd", "edg", "edi", "edu", "eel", "eff", "egg", "ego", "eic", "eje", "elb", "eld", "ele", "elf", "eli", "elk", "ell", "elm", "elo", "elv", "ema", "emb", "eme", "emi", "emo", "emp", "emu", "ena", "enc", "end", "ene", "enf", "eng", "eni", "enj", "enq", "enr", "ent", "env", "enz", "epa", "epe", "eph", "epi", "epo", "equ", "era", "ere", "ero", "err", "esc", "esp", "ess", "est", "ete", "eth", "eup", "eur", "eva", "eve", "evi", "evo", "ex-", "exa", "exc", "exe", "exh", "exi", "exo", "exp", "ext", "eye", "eyr", "fab", "fac", "fah", "fai", "fal", "fam", "fan", "far", "fas", "fat", "fau", "fav", "faw", "fax", "fea", "fed", "fee", "fel", "fem", "fen", "fer", "fes", "fet", "few", "fib", "fic", "fid", "fie", "fif", "fig", "fil", "fin", "fir", "fis", "fit", "fix", "fla", "fle", "fli", "flo", "flu", "fly", "foa", "fob", "foc", "fog", "fol", "fon", "foo", "for", "fou", "fow", "fox", 
"fra", "fre", "fri", "fro", "fru", "fry", "fuc", "fue", "fug", "ful", "fun", "fur", "fus", "fut", "gad", "gaf", "gai", "gal", "gam", "gan", "gap", "gar", "gas", "gat", "gau", "gav", "gaz", "gea", "gee", "gel", "gem", "gen", "geo", "ger", "ges", "gey", "ghe", "gho", "gia", "gif", "gig", "gin", "gir", "git", "gla", "gle", "gli", "glo", "glu", "gna", "gnu", "go-", "goa", "gob", "god", "gog", "goi", "gol", "gon", "goo", "gop", "gor", "gos", "gov", "gow", "gra", "gre", "gri", "gro", "gru", "gua", "gue", "gui", "gum", "gun", "gut", "guy", "gym", "gyn", "gyr", "hab", "hac", "hai", "hak", "hal", "ham", "han", "hap", "har", "has", "hat", "hau", "hav", "haw", "hay", "haz", "hea", "hec", "hed", "hee", "hei", "hel", "hem", "hen", "hep", "her", "hes", "het", "hex", "hey", "hic", "hid", "hie", "hig", "hik", "hil", "hin", "hip", "hir", "his", "hit", "hiv", "hob", "hoc", "hoe", "hog", "hol", "hom", "hon", "hoo", "hop", "hor", "hos", "hot", "hou", "hov", "how", "hub", "hug", "hul", "hum", "hun", "hur", "hus", "hut", "hya", "hyb", "hyd", "hye", "hyg", "hyp", "ice", "ici", "ico", "icy", "id", "ide", "idi", "igl", "ign", "ike", "ill", "ima", "imb", "imi", "imm", "imp", "in-", "ina", "inb", "inc", "ind", "ine", "inf", "ing", "inh", "ini", "inj", "ink", "inl", "inn", "inp", "inq", "ins", "int", "inv", "iri", "iro", "irr", "isc", "isl", "iso"]
Lambdas/Scraping/prefix_list_part2.txt (new file): 1 line (diff suppressed because one or more lines are too long)
Lambdas/Scraping/runOrchestrator.py (new file): 19 lines

import json
import boto3

def lambda_handler(event, context):
    with open("words.txt") as words_file:
        words = json.load(words_file)
    print(words)
    # Fan out: one asynchronous ("Event") KohlsScraper invocation per word.
    for word in words:
        client = boto3.client('lambda')
        response = client.invoke(
            FunctionName='KohlsScraper',
            InvocationType="Event",
            LogType="None",
            Payload="""{"toScrape": \"""" + word + "\"}"
        )
    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }
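The invocation payload is assembled by hand-escaped string concatenation; an equivalent, quoting-safe construction would be json.dumps, sketched here for comparison (not what the committed code does):

# Equivalent payload construction (sketch): json.dumps handles quoting instead of manual escaping.
import json

word = "aba"                                # example prefix from the generated list
payload = json.dumps({"toScrape": word})    # -> '{"toScrape": "aba"}'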
Lambdas/Scraping/setupBuildFolder.sh (new file): 7 lines

#!/bin/bash
# Currently to be run only from the Scraping directory
mkdir build
pip3 install --target build requests
pip3 install --target build PyMySQL
pip3 install --target build beautifulsoup4
mkdir artifacts