"""Kohl's scraping outline.

Prototype with some of the initial work needed to pull in data from Kohl's
for populating the database: a non-durable (markup-dependent) extraction of
price, description and image URL from a search-results page, integrated with
the Zenscrape web requesting/scraping service.

Provenance: Lambdas/Scraping/KohlsScraper.py, introduced by commit
8e6dbac183e1d88dde196f975f9160560443033a (NMerz, Sat, 17 Oct 2020
22:56:25 -0400), subject "[PATCH] Kohl's scraping outline".
"""

import os
import re
from typing import Iterable, Iterator, Optional, Tuple
from urllib.parse import quote_plus

ZENSCRAPE_ENDPOINT = "https://app.zenscrape.com/api/v1/get"
KOHLS_SEARCH_URL = (
    "https://www.kohls.com/search.jsp?submit-search=web-regular&search="
)
IMAGE_URL_BASE = "https://media.kohlsimg.com/is/image/kohls/"
# Element ids of the price blocks on the search-results page.
PRICE_ID_PATTERN = re.compile(".*_prod_price")
# Without a timeout, requests may block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 60


def fetch_search_page(
    search_term: str = "shoes", api_key: Optional[str] = None
) -> str:
    """Fetch the Kohl's search-results HTML for *search_term* via Zenscrape.

    Args:
        search_term: Text to search Kohl's for; it is URL-quoted.
        api_key: Zenscrape API key. When ``None`` the key is read from the
            ``ZENSCRAPE_API_KEY`` environment variable (empty if unset).

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.RequestException: On connection failures or timeout.
    """
    # Imported here so the pure parsing helpers stay importable (and
    # testable) in environments where the third-party package is absent.
    import requests

    if api_key is None:
        api_key = os.environ.get("ZENSCRAPE_API_KEY", "")
    headers = {"apikey": api_key}
    params = (
        ("url", KOHLS_SEARCH_URL + quote_plus(search_term)),
        ("location", "na"),
    )
    response = requests.get(
        ZENSCRAPE_ENDPOINT,
        headers=headers,
        params=params,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()
    return response.text


def parse_price_text(text: str) -> Tuple[Optional[str], str]:
    """Split the text of a price element into ``(price, description)``.

    The price is the first whitespace-separated token containing ``$``
    (``None`` when there is no such token). The description is made of the
    tokens that follow the last ``$`` token, skipping any token containing
    a parenthesis, joined by single spaces.
    """
    price = None
    words = []
    for token in text.split():
        if "$" in token:
            # Anything collected so far was a label preceding a price.
            words.clear()
            if price is None:
                price = token
            continue
        if "(" in token or ")" in token:
            continue
        words.append(token)
    return price, " ".join(words)


def extract_image_url(preceding_markup: Iterable[object]) -> str:
    """Return the Kohl's image URL found in *preceding_markup*.

    Each item is converted with ``str()`` and searched for the image base
    URL; the URL is cut at the first ``?`` or ``"``. When several items
    contain an image URL the last one iterated wins. Returns ``""`` when
    none does.
    """
    image_url = ""
    for node in preceding_markup:
        markup = str(node)
        if IMAGE_URL_BASE in markup:
            tail = markup.split(IMAGE_URL_BASE)[1]
            image_url = IMAGE_URL_BASE + tail.split("?")[0].split('"')[0]
    return image_url


def format_product(
    price: Optional[str], description: str, image_url: str
) -> str:
    """Render one product line; a missing price is shown as ``None``."""
    return f"{price} for: {description} @: {image_url}"


def extract_products(html: str) -> Iterator[Tuple[Optional[str], str, str]]:
    """Yield ``(price, description, image_url)`` for each price element.

    Price elements are those whose ``id`` matches ``PRICE_ID_PATTERN``; the
    image URL is looked up among the element's previous siblings.
    """
    # Local import for the same reason as in fetch_search_page.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    for match in soup.find_all(id=PRICE_ID_PATTERN):
        price, description = parse_price_text(match.text)
        yield price, description, extract_image_url(match.previous_siblings)


def main() -> None:
    """Fetch the default search page, dump it, then print each product."""
    html = fetch_search_page()
    print(html)
    for price, description, image_url in extract_products(html):
        print(format_product(price, description, image_url))


if __name__ == "__main__":
    main()