Commit 2b88cb2

added scripts
1 parent ac7461c commit 2b88cb2

7 files changed: +386 additions, −15 deletions


FlipkartScraper/dbConnector.py

Lines changed: 44 additions & 0 deletions
import sqlite3
import os


class FlipkartDatabaseConnector:
    def __init__(self, stamp):
        # connects to (or creates) flipkart.db in the current working directory
        self.dbPath = "flipkart.db"
        self.conn = sqlite3.connect(self.dbPath)
        self.cur = self.conn.cursor()
        self.welcomeMessage = "Welcome to Flipkart Scraper. This is the database for the Flipkart Scraper. This database was created on {}.".format(stamp)

    def schemaMaker(self):
        # creating tables
        self.cur.execute("""CREATE TABLE products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            sku TEXT NOT NULL,
            name TEXT NOT NULL,
            description TEXT NOT NULL,
            image_path TEXT NOT NULL,
            category TEXT NOT NULL,
            timestamp TEXT NOT NULL,
            URL TEXT NOT NULL,
            price TEXT NOT NULL
        );""")
        self.conn.commit()
        self.cur.execute("CREATE TABLE product_matches (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, product_sku INTEGER NOT NULL, match_id INTEGER NOT NULL, match_sku INTEGER NOT NULL);")
        self.conn.commit()

    def insertProduct(self, productDetails):
        # parameterised insert; productDetails is the dict built by main.getProductDetails
        self.cur.execute("INSERT INTO products (sku, name, description, image_path, category, timestamp, URL, price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", (productDetails["sku"], productDetails["name"], productDetails["description"], productDetails["image_path"], productDetails["category"], productDetails["timestamp"], productDetails["URL"], productDetails["price"]))
        self.conn.commit()

    def fetchAllProducts(self):
        self.cur.execute("SELECT * FROM products")
        return self.cur.fetchall()

    def clearDatabase(self):
        self.cur.execute("DELETE FROM products")
        self.conn.commit()
        self.cur.execute("DELETE FROM product_matches")
        self.conn.commit()

    def removeDuplicates(self):
        # keep only the first row stored for each SKU
        self.cur.execute("DELETE FROM products WHERE rowid NOT IN (SELECT MIN(rowid) FROM products GROUP BY sku)")
        self.conn.commit()
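
The connector is a thin wrapper around sqlite3, so it can be exercised on its own. A minimal usage sketch (not part of the commit; the product values are made up, and it assumes it runs from the FlipkartScraper directory so flipkart.db is created next to the scripts):

# Minimal usage sketch for FlipkartDatabaseConnector (illustrative, not part of the commit).
from datetime import datetime
from dbConnector import FlipkartDatabaseConnector

stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
db = FlipkartDatabaseConnector(stamp)
db.schemaMaker()  # only needed once; raises sqlite3.OperationalError if the tables already exist

db.insertProduct({
    "sku": "MOBG6VF5Q82T3XYZ",                     # hypothetical example values
    "name": "Example Phone (64 GB)",
    "description": "Sample description",
    "image_path": "https://example.com/image.jpg",
    "category": "Mobiles",
    "timestamp": stamp,
    "URL": "https://www.flipkart.com/example-phone/p/itm123",
    "price": "9,999",
})

db.removeDuplicates()
print(db.fetchAllProducts())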

FlipkartScraper/flipkart.db

0 Bytes
Binary file not shown.

FlipkartScraper/genricHtmlib.py

Lines changed: 153 additions & 0 deletions
from multiprocessing import Pool
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import warnings
import requests
warnings.filterwarnings("ignore")


class SeleniumScraper:
    def __init__(self, timeout=10):
        self.timeout = timeout
        self.reqSession = requests.Session()
        self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath = os.path.join(
            os.path.dirname(os.path.abspath(__file__))
        )

        # default request headers (note: 'authority' still points at amazon.com;
        # fetch_request_normal below builds its own User-Agent header)
        self.headers = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def fetch_request_normal(self, url, params=None):
        # plain requests fetch; returns the HTML text or None on failure
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
            }
            response = self.reqSession.get(url, headers=headers)

            if response.status_code == 200:
                return response.text

            if response.status_code == 301:
                # retry with the redirect target
                response = requests.get(response.headers['Location'])
                response.raise_for_status()
                if response.status_code == 200:
                    return response.text

            if response.status_code == 503:
                # blocked / rate limited; the caller falls back to Selenium
                return None

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
        return None

    def get_xpath_link(self, doc, xpath, website):
        # extract hrefs and turn relative paths into absolute URLs
        try:
            name = doc.xpath("".join(xpath))
            for i in range(len(name)):
                if name[i].startswith("/"):
                    name[i] = website + name[i]
            return name

        except Exception as e:
            print("Error in getting links with {}: {}".format(xpath, e))
            return None

    def get_selenium_driver(self):
        # headless Chrome with images disabled to keep page loads light
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        chrome_options.add_argument("--blink-settings=imagesEnabled=false")
        # note: newer Selenium releases expect options= instead of chrome_options=
        driver = webdriver.Chrome(chrome_options=chrome_options)
        return driver

    def fetch_request_selenium(self, url, waiting_time=1):
        # Selenium fallback; returns an lxml document or None
        try:
            driver = self.get_selenium_driver()
            driver.get(url)
            time.sleep(waiting_time)
            doc = html.fromstring(driver.page_source)
            driver.close()
            return doc

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
            return None

    def get_xpath_data(self, doc, xpath):
        try:
            return doc.xpath(xpath)

        except Exception as e:
            print("Error in getting data with {}: {}".format(xpath, e))
            return None

    def slow_page_scroll(self, driver, speed):
        # scroll in 1000px steps so lazy-loaded content has time to render
        current_scroll_position = driver.execute_script("return window.pageYOffset;")
        while current_scroll_position < driver.execute_script(
            "return document.body.scrollHeight;"
        ):
            driver.execute_script(
                "window.scrollTo(0, arguments[0]);", current_scroll_position
            )
            current_scroll_position += 1000
            time.sleep(speed)

    def data_storage(self, df_list, unique_id, name, storageFormat, storagePath=None):
        # merge collected frames, drop duplicates and write csv/json next to this file
        df_combined = pd.concat(df_list, ignore_index=True)
        df_combined.drop_duplicates(subset=unique_id, inplace=True)
        if storageFormat == "csv":
            df_combined.to_csv(
                self.storagePath + "/{}_{}.csv".format(name, self.stamp),
                index=False,
            )
        elif storageFormat == "json":
            df_combined.to_json(
                self.storagePath + "/{}_{}.json".format(name, self.stamp),
                orient="records",
            )

    def cleanData(self, array):
        # strip whitespace, drop empties, remove non-ascii characters and newlines
        array = [x.strip() for x in array]
        array = list(filter(None, array))
        array = [x.encode("ascii", "ignore").decode() for x in array]
        array = [x.replace("\n", "") for x in array]
        return array
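
The two fetch methods are meant to be chained: try the plain requests session first and fall back to headless Chrome only when it returns None. A minimal sketch of that pattern (not part of the commit; the search URL and XPath mirror the ones main.py uses, and a working chromedriver is assumed for the Selenium fallback):

# Illustrative usage of SeleniumScraper (not part of the commit).
import lxml.html as html
from genricHtmlib import SeleniumScraper

scraper = SeleniumScraper()
url = "https://www.flipkart.com/search?q=mobiles"

page = scraper.fetch_request_normal(url)
if page is None:
    # requests was blocked (e.g. a 503), fall back to headless Chrome
    doc = scraper.fetch_request_selenium(url)
else:
    doc = html.fromstring(page)

links = scraper.get_xpath_link(doc, '//*[@rel="noopener noreferrer"]//@href', "https://www.flipkart.com")
print(len(links or []), "product links found")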

FlipkartScraper/main.py

Lines changed: 156 additions & 0 deletions
import logging
from datetime import datetime
from dbConnector import FlipkartDatabaseConnector
from productList import product_categories
from genricHtmlib import SeleniumScraper
import os
import lxml.html as html
import concurrent.futures

SeleniumScraper = SeleniumScraper()  # module-level helper instance (shadows the class name)


class Scraper:
    def __init__(self):
        self.brand: str = "flipkart"
        self.website = "https://www.flipkart.com/search?q="
        self.websiteName = "https://www.flipkart.com"
        self.stamp: str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath: str = os.getcwd()

        self.productLinksXpath = '//*[@rel="noopener noreferrer"]//@href'
        self.skuXpath = '//tr[contains(@class, "row")]//td[contains(text(), "Model Number")]/following-sibling::td[1]/ul/li/text()'
        self.nameXpath = '//*[@class="B_NuCI"]//text()'
        self.description = '//div[contains(text(), "Description")]/following-sibling::div[1]/div/text()'
        self.image = '//*[@class="_396cs4 _2amPTt _3qGmMb"]//@src'
        self.category = '//*[@class="_3GIHBu"]//text()'
        self.price = '//*[@class="_30jeq3 _16Jk6d"]//text()'

    def getProductList(self, keyword):
        # collect product links from the first 19 search result pages for a keyword
        try:
            productLinks = []
            url = self.website + keyword
            response = SeleniumScraper.fetch_request_normal(url)
            if response is None:
                doc = SeleniumScraper.fetch_request_selenium(url)
            else:
                doc = html.fromstring(response)

            Links = SeleniumScraper.get_xpath_link(doc, self.productLinksXpath, self.websiteName)
            productLinks.extend(Links)

            for page in range(2, 20):
                print(f'Getting Page {page} for {keyword}')
                url = self.website + keyword + "&page=" + str(page)
                response = SeleniumScraper.fetch_request_normal(url)
                if response is None:
                    doc = SeleniumScraper.fetch_request_selenium(url)
                else:
                    doc = html.fromstring(response)

                Links = SeleniumScraper.get_xpath_link(doc, self.productLinksXpath, self.websiteName)
                productLinks.extend(Links)

            print(f'Total products for {keyword} is {len(productLinks)}')
            return productLinks

        except Exception as e:
            print(e)

    def getProductDetails(self, productLink):
        # scrape one product page; any missing field falls back to "None"
        print(f'Getting product details for {productLink}')
        response = SeleniumScraper.fetch_request_normal(productLink)
        if response is None:
            doc = SeleniumScraper.fetch_request_selenium(productLink)
        else:
            doc = html.fromstring(response)

        productDetails = {}

        try:
            sku = SeleniumScraper.get_xpath_data(doc, self.skuXpath)
            sku = sku[0]
        except:
            sku = "None"

        try:
            name = SeleniumScraper.get_xpath_data(doc, self.nameXpath)
            name = name[0]
        except:
            name = "None"

        try:
            description = SeleniumScraper.get_xpath_data(doc, self.description)
            description = ''.join(description)
        except:
            description = "None"

        try:
            image_path = SeleniumScraper.get_xpath_link(doc, self.image, self.websiteName)
            image_path = image_path[0]
        except:
            image_path = "None"

        try:
            category = SeleniumScraper.get_xpath_data(doc, self.category)
            category = category[1]
        except:
            category = "None"

        try:
            price = SeleniumScraper.get_xpath_data(doc, self.price)
            price = SeleniumScraper.cleanData(price)
            price = price[0]
        except:
            price = "None"

        productDetails["sku"] = str(sku)
        productDetails["name"] = str(name)
        productDetails["description"] = str(description)
        productDetails["image_path"] = str(image_path)
        productDetails["category"] = str(category)
        productDetails["timestamp"] = str(self.stamp)
        productDetails["URL"] = str(productLink)
        productDetails["price"] = price

        print(productDetails)
        return productDetails

    def start(self):
        productList = []
        number_of_threads: int = 1

        # Log start of scraper
        print(f"Starting {self.brand} scraper")

        # create flipkart.db with the schema if it doesn't exist yet
        if not os.path.exists(self.storagePath + "/" + self.brand + ".db"):
            print(f'Creating {self.brand}.db at {self.storagePath + "/" + self.brand + ".db"}')
            db = FlipkartDatabaseConnector(self.stamp)
            db.schemaMaker()
            print(db.welcomeMessage)

        self.db = FlipkartDatabaseConnector(self.stamp)
        print(self.db.welcomeMessage)

        # gather product links for every category keyword
        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            productUrls = executor.map(self.getProductList, product_categories)
            productList.extend(productUrls)

        # flatten the list of per-category link lists
        productList = [item for sublist in productList for item in sublist]
        print(f'Total products for {self.brand} is {len(productList)}')

        # scrape each product page and save it to the database
        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            results = executor.map(self.getProductDetails, productList)

            for result in results:
                print(f"Saving {result['sku']} to db")
                self.db.insertProduct(result)

        self.db.removeDuplicates()


if __name__ == '__main__':
    scraper = Scraper()
    scraper.start()
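
main.py wires everything together: one thread pool gathers product links per category keyword, a second scrapes each product page, and every result is written through the connector before duplicates are pruned. A small follow-up sketch (not part of the commit; it assumes the commands run from the FlipkartScraper directory so flipkart.db is in the working directory) runs the scraper and reads the stored rows back:

# Illustrative follow-up (not part of the commit): run the scraper, then inspect the database.
from datetime import datetime
from main import Scraper
from dbConnector import FlipkartDatabaseConnector

Scraper().start()  # writes results into flipkart.db in the current working directory

db = FlipkartDatabaseConnector(datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
for row in db.fetchAllProducts():
    print(row[1], row[8])  # sku and price columns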

FlipkartScraper/productList.py

Lines changed: 3 additions & 0 deletions
product_categories = [
    'mobiles',
]
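
productList.py is just the list of search keywords that main.py appends to https://www.flipkart.com/search?q=; this commit ships only 'mobiles'. Widening coverage is one line per keyword, for example (illustrative keywords, not part of the commit):

product_categories = [
    'mobiles',
    'laptops',      # each entry becomes a separate Flipkart search query
    'headphones',
]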

FlipkartScraper/requirements.txt

Lines changed: 0 additions & 15 deletions
@@ -1,25 +1,10 @@
-jupyter
-scikit-learn
-pandas
-numpy
-matplotlib
-seaborn
-tensorflow
-flask
-openai
 bs4
 requests
 pandas
-requests
 numpy
 bs4
-geopy
 boto3
 ndjson
 selenium
 httpx
 lxml
-python-dotenv
-paramiko
-undetected-chromedriver
-fastjsonschema

0 commit comments
