Commit f8f6199

Add parser class from Banyan and the lawyer profile gather Scrapy project.
1 parent c107d6a commit f8f6199

File tree

13 files changed: +1252, -0 lines changed

code-examples/banyan_parser.py

Lines changed: 122 additions & 0 deletions

#!/usr/bin/env python
# encoding: utf-8
"""
BanyanParser.py
Created by Robert Dempsey on 1/23/15.
Copyright (c) 2015 Robert Dempsey. All rights reserved.

Taken from Banyan: https://github.com/rdempsey/banyan
"""

from pyparsing import *
from bin.LocalFile import *
from bin.Mailer import *
from bin.WebSearch import *
from bin.LocalProject import *

lower = str.lower


class BanyanParser:
    def __init__(self, **kwargs):
        self.properties = kwargs

    # Input
    @property
    def input(self):
        return self.properties.get('input', 'None')

    @input.setter
    def input(self, s):
        self.properties['input'] = s

    @input.deleter
    def input(self):
        del self.properties['input']

    def parse(self):
        """
        Commands
        word :: group of alphabetic characters
        command :: the first word of the sentence
        command_object :: what the command needs to do

        Questions
        question :: the first word of the sentence begins with a contraction
        """

        # Put the input into a string
        input = self.input

        # Parse Actions
        join_tokens = lambda tokens: " ".join(tokens)

        # Define grammar
        comma = Literal(",").suppress()
        command = oneOf("check Check create Create open Open search Search get Get email Email tweet Tweet")
        act_on = oneOf("project file web locally")
        command_object = OneOrMore(Word(alphas + "'."))
        what_time = oneOf("current today's tomorrow's")
        subject = Literal("subject")

        # Assign parse actions
        command_object.setParseAction(join_tokens)

        # Commands
        create_open_search = command("command") + act_on("act_on") + command_object("name")
        get = command("command") + what_time("time") + command_object("object")
        email = command("command") + command_object("email_to") + comma + subject + command_object("email_subject")
        tweet = command("command") + command_object("tweet")
        launch_check = command("command") + command_object("app")

        try:
            w = command.parseString(input)
            w_command = lower(w[0])
            if w_command == "create":
                c = create_open_search.parseString(input)
                if c.act_on == "project":
                    system("say Shall I store the project in a private repo?")
                    save_in_github = raw_input("Save in Github > ")
                    p = LocalProject()
                    p.create_new_project(c.name, save_in_github)
                elif c.act_on == "file":
                    # TODO: add create file
                    pass
            elif w_command == "check":
                chk = launch_check.parseString(input)
                if chk.app == "email":
                    SayGmailCount().start()
                    SayADSCount().start()
                    SayDC2Count().start()
            elif w_command == "open":
                c = create_open_search.parseString(input)
                if c.act_on == "file":
                    f = LocalFile()
                    f.open_file(c.name)
                elif c.act_on == "project":
                    # TODO: add open project
                    pass
                else:
                    pass
            elif w_command == "search":
                s = create_open_search.parseString(input)
                if s.act_on == "web":
                    ws = WebSearch()
                    ws.perform_search(s.name)
            elif w_command == "email":
                e = email.parseString(input)
                print("email: {}, subject: {}".format(e.email_to, e.email_subject))
            elif w_command == "tweet":
                # TODO: add tweeting
                # t = tweet.parseString(input)
                pass
            else:
                print("I don't know what you want me to do...")
        except Exception as e:
            system("say Please enter a valid command")
            print("Error: {}".format(str(e)))


if __name__ == '__main__':
    pass
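
A minimal usage sketch (not part of the commit) of the create/open/search grammar defined in parse() above. It assumes only that pyparsing is installed; the sample sentence "open file budget.notes" is invented:

from pyparsing import OneOrMore, Word, alphas, oneOf

command = oneOf("check Check create Create open Open search Search get Get email Email tweet Tweet")
act_on = oneOf("project file web locally")
command_object = OneOrMore(Word(alphas + "'."))
command_object.setParseAction(lambda tokens: " ".join(tokens))

# Same shape as create_open_search in the class above: verb + target + free text
create_open_search = command("command") + act_on("act_on") + command_object("name")

c = create_open_search.parseString("open file budget.notes")
print("{} | {} | {}".format(c.command, c.act_on, c.name))  # open | file | budget.notes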

code-examples/laywer_profile_gather/lp/__init__.py

Whitespace-only changes.

code-examples/laywer_profile_gather/lp/items.py

Lines changed: 40 additions & 0 deletions
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LpItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    address_locality = scrapy.Field()
    address_region = scrapy.Field()
    job_title = scrapy.Field()
    profile_link = scrapy.Field()
    pass


class LpProfile(scrapy.Item):
    profile_url = scrapy.Field()
    name = scrapy.Field()
    firm = scrapy.Field()
    firm_address_1 = scrapy.Field()
    firm_address_2 = scrapy.Field()
    firm_city = scrapy.Field()
    firm_state = scrapy.Field()
    firm_zipcode = scrapy.Field()
    firm_phone_1 = scrapy.Field()
    firm_phone_2 = scrapy.Field()
    firm_fax = scrapy.Field()
    firm_url = scrapy.Field()
    position = scrapy.Field()
    practice_area_1 = scrapy.Field()
    practice_area_2 = scrapy.Field()
    practice_area_3 = scrapy.Field()
    practice_area_4 = scrapy.Field()
    practice_area_5 = scrapy.Field()
    practice_area_6 = scrapy.Field()
    pass
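
Scrapy items behave like dictionaries, which is what the pipeline code later in this commit relies on when it iterates item.keys(). A hypothetical sketch, assuming scrapy is installed and the lp package is importable; the field values are invented:

from lp.items import LpItem

item = LpItem()
item['name'] = "Jane Doe"            # invented value
item['address_locality'] = "Austin"  # invented value
item['address_region'] = "TX"        # invented value
print(dict(item))  # the populated fields as a plain dict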

code-examples/laywer_profile_gather/lp/middlewares.py

Lines changed: 13 additions & 0 deletions
import os
import random
from scrapy.conf import settings
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    def __init__(self):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', self.ua.random)
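
The middleware only takes effect once it is registered under DOWNLOADER_MIDDLEWARES, which the settings file later in this commit does; because it uses setdefault, a request that already carries a User-Agent header keeps it. A quick, hypothetical way to see the kind of values fake_useragent produces (assumes only that fake-useragent is installed):

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)  # a randomly selected real-world browser User-Agent string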

code-examples/laywer_profile_gather/lp/pipelines.py

Lines changed: 56 additions & 0 deletions
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
import csv
import string
import unicodedata
from lp import settings

exclude = set(string.punctuation)


def fully_clean_item(x):
    """
    Helper function to perform all the available cleanings
    """
    # Remove unicode characters
    x = str(x.encode('utf8'))
    # Remove extra commas
    x = x.replace(",", "")
    # Remove extra whitespace from the front and back
    x = x.strip()
    # Remove all extra line breaks and tabs
    x = x.replace("\n", "")
    x = x.replace("\t", "")
    return x


def write_to_csv(item):
    writer = csv.writer(open(settings.lawyer_profile_links_file, 'a'), lineterminator='\n')
    writer.writerow([item[key] for key in item.keys()])


def write_to_profiles_csv(item):
    writer = csv.writer(open(settings.lawyer_profiles_file, 'a'), lineterminator='\n')
    writer.writerow([item[key] for key in item.keys()])


class LpPipeline(object):
    def process_item(self, item, spider):
        write_to_csv(item)
        return item


class ProfilesPipeline(object):
    def process_item(self, item, spider):
        for key in item.keys():
            item[key] = fully_clean_item(item[key])
        write_to_profiles_csv(item)
        return item
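
fully_clean_item flattens each field to a single comma-free line before write_to_profiles_csv appends it, so stray commas, tabs, and line breaks cannot corrupt the CSV columns. A rough sketch of the effect on one messy value, assuming the Python 2 string semantics the rest of the project targets (scrapy.conf, raw_input); the firm name is invented:

messy = u"  Dewey, Cheatem & Howe\n"
print(fully_clean_item(messy))  # -> Dewey Cheatem & Howe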

code-examples/laywer_profile_gather/lp/settings.py

Lines changed: 38 additions & 0 deletions
# -*- coding: utf-8 -*-

# Scrapy settings for lp project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'lp'

SPIDER_MODULES = ['lp.spiders']
NEWSPIDER_MODULE = 'lp.spiders'

# Be nice to the sites we're crawling
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5.0
AUTOTHROTTLE_MAX_DELAY = 60.0
DOWNLOAD_DELAY = 5

# Disable cookies
COOKIES_ENABLED = False

# Define which pipeline we'll use
ITEM_PIPELINES = ['lp.pipelines.ProfilesPipeline']

# On the Pi
start_urls_file = '/home/pi/Dev/profile_gather/lp/data/start_urls.csv'
lawyer_profile_links_file = '/home/pi/Dev/profile_gather/lp/data/lawyer_profile_urls_updated.csv'
lawyer_profiles_file = '/home/pi/Dev/profile_gather/lp/data/lawyer_profiles.csv'

# Use the fake_useragent python library
# Requires you to: pip install fake-useragent
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'lp.middlewares.RandomUserAgentMiddleware': 400,
}
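
The commit keeps the CSV paths as plain module-level names that the pipeline reads back via settings.lawyer_profiles_file, and registers the pipeline as a list. Newer Scrapy releases expect ITEM_PIPELINES as a dict mapping the pipeline path to a priority; a hedged sketch of the equivalent setting, not part of the commit:

ITEM_PIPELINES = {
    'lp.pipelines.ProfilesPipeline': 300,
}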

code-examples/laywer_profile_gather/lp/spiders/__init__.py

Lines changed: 4 additions & 0 deletions
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Lines changed: 34 additions & 0 deletions
from scrapy import Spider
from scrapy.selector import Selector
from lp.items import LpItem
from lp import settings
import csv


class LpSpider(Spider):
    name = "lp"
    allowed_domains = ["MAIN_URL_HERE"]
    start_urls = []

    with open(settings.start_urls_file, 'r') as csvfile:
        data = csv.reader(csvfile)
        for row in data:
            for column in row:
                start_urls.append(column)

    def parse(self, response):
        lawyers = Selector(response).xpath('//div[@class="block_content"]/a')

        for lawyer in lawyers:
            item = LpItem()
            item['profile_link'] = lawyer.xpath(
                '@href').extract()[0]
            item['name'] = lawyer.xpath(
                'div[@class="bp_listings_result_header"]/text()').extract()[0]
            item['address_locality'] = lawyer.xpath(
                'div[@class="bp_listings_result_address"]/span[@itemprop="addressLocality"]/text()').extract()[0]
            item['address_region'] = lawyer.xpath(
                'div[@class="bp_listings_result_address"]/span[@itemprop="addressRegion"]/text()').extract()[0]
            item['job_title'] = lawyer.xpath(
                'div[@class="bp_listings_result_description"]/text()').extract()[0]
            yield item
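
With the settings above in place, the spider would normally be started from the project directory with "scrapy crawl lp"; the pipeline configured in settings.py then appends each yielded item to a CSV on disk using the paths defined there. The allowed_domains value (MAIN_URL_HERE) and the start-URL CSV are placeholders, so both need real values before a run.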
