Commit f8f6199

Add parser class from Banyan and the lawyer profile gather Scrapy project.
1 parent c107d6a commit f8f6199

File tree

13 files changed: +1252, -0 lines changed

code-examples/banyan_parser.py

Lines changed: 122 additions & 0 deletions

#!/usr/bin/env python
# encoding: utf-8
"""
BanyanParser.py
Created by Robert Dempsey on 1/23/15.
Copyright (c) 2015 Robert Dempsey. All rights reserved.

Taken from Banyan: https://github.com/rdempsey/banyan
"""

from pyparsing import *
from bin.LocalFile import *
from bin.Mailer import *
from bin.WebSearch import *
from bin.LocalProject import *

lower = str.lower


class BanyanParser:
    def __init__(self, **kwargs):
        self.properties = kwargs

    # Input
    @property
    def input(self):
        return self.properties.get('input', 'None')

    @input.setter
    def input(self, s):
        self.properties['input'] = s

    @input.deleter
    def input(self):
        del self.properties['input']

    def parse(self):
        """
        Commands
        word :: group of alphabetic characters
        command :: the first word of the sentence
        command_object :: what the command needs to do

        Questions
        question :: the first word of the sentence begins with a contraction
        """

        # Put the input into a string
        input = self.input

        # Parse Actions
        join_tokens = lambda tokens: " ".join(tokens)

        # Define grammar
        comma = Literal(",").suppress()
        command = oneOf("check Check create Create open Open search Search get Get email Email tweet Tweet")
        act_on = oneOf("project file web locally")
        command_object = OneOrMore(Word(alphas + "'."))
        what_time = oneOf("current today's tomorrow's")
        subject = Literal("subject")

        # Assign parse actions
        command_object.setParseAction(join_tokens)

        # Commands
        create_open_search = command("command") + act_on("act_on") + command_object("name")
        get = command("command") + what_time("time") + command_object("object")
        email = command("command") + command_object("email_to") + comma + subject + command_object("email_subject")
        tweet = command("command") + command_object("tweet")
        launch_check = command("command") + command_object("app")

        try:
            w = command.parseString(input)
            w_command = lower(w[0])
            if w_command == "create":
                c = create_open_search.parseString(input)
                if c.act_on == "project":
                    system("say Shall I store the project in a private repo?")
                    save_in_github = raw_input("Save in Github > ")
                    p = LocalProject()
                    p.create_new_project(c.name, save_in_github)
                elif c.act_on == "file":
                    # TODO: add create file
                    pass
            elif w_command == "check":
                chk = launch_check.parseString(input)
                if chk.app == "email":
                    SayGmailCount().start()
                    SayADSCount().start()
                    SayDC2Count().start()
            elif w_command == "open":
                c = create_open_search.parseString(input)
                if c.act_on == "file":
                    f = LocalFile()
                    f.open_file(c.name)
                elif c.act_on == "project":
                    # TODO: add open project
                    pass
                else:
                    pass
            elif w_command == "search":
                s = create_open_search.parseString(input)
                if s.act_on == "web":
                    ws = WebSearch()
                    ws.perform_search(s.name)
            elif w_command == "email":
                e = email.parseString(input)
                print("email: {}, subject: {}".format(e.email_to, e.email_subject))
            elif w_command == "tweet":
                # TODO: add tweeting
                # t = tweet.parseString(input)
                pass
            else:
                print("I don't know what you want me to do...")
        except Exception as e:
            system("say Please enter a valid command")
            print("Error: {}".format(str(e)))


if __name__ == '__main__':
    pass
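
A minimal usage sketch (not part of the commit) of the create/open/search grammar defined in parse() above. It assumes only that pyparsing is installed; the sample sentence "open file budget.notes" is invented:

from pyparsing import OneOrMore, Word, alphas, oneOf

command = oneOf("check Check create Create open Open search Search get Get email Email tweet Tweet")
act_on = oneOf("project file web locally")
command_object = OneOrMore(Word(alphas + "'."))
command_object.setParseAction(lambda tokens: " ".join(tokens))

# Same shape as create_open_search in the class above: verb + target + free text
create_open_search = command("command") + act_on("act_on") + command_object("name")

c = create_open_search.parseString("open file budget.notes")
print("{} | {} | {}".format(c.command, c.act_on, c.name))  # open | file | budget.notes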

code-examples/laywer_profile_gather/lp/__init__.py

Whitespace-only changes.

code-examples/laywer_profile_gather/lp/items.py

Lines changed: 40 additions & 0 deletions
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LpItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    address_locality = scrapy.Field()
    address_region = scrapy.Field()
    job_title = scrapy.Field()
    profile_link = scrapy.Field()
    pass


class LpProfile(scrapy.Item):
    profile_url = scrapy.Field()
    name = scrapy.Field()
    firm = scrapy.Field()
    firm_address_1 = scrapy.Field()
    firm_address_2 = scrapy.Field()
    firm_city = scrapy.Field()
    firm_state = scrapy.Field()
    firm_zipcode = scrapy.Field()
    firm_phone_1 = scrapy.Field()
    firm_phone_2 = scrapy.Field()
    firm_fax = scrapy.Field()
    firm_url = scrapy.Field()
    position = scrapy.Field()
    practice_area_1 = scrapy.Field()
    practice_area_2 = scrapy.Field()
    practice_area_3 = scrapy.Field()
    practice_area_4 = scrapy.Field()
    practice_area_5 = scrapy.Field()
    practice_area_6 = scrapy.Field()
    pass
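
Scrapy items behave like dictionaries, which is what the pipeline code later in this commit relies on when it iterates item.keys(). A hypothetical sketch, assuming scrapy is installed and the lp package is importable; the field values are invented:

from lp.items import LpItem

item = LpItem()
item['name'] = "Jane Doe"            # invented value
item['address_locality'] = "Austin"  # invented value
item['address_region'] = "TX"        # invented value
print(dict(item))  # the populated fields as a plain dict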

code-examples/laywer_profile_gather/lp/middlewares.py

Lines changed: 13 additions & 0 deletions
import os
import random
from scrapy.conf import settings
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    def __init__(self):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', self.ua.random)
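
The middleware only takes effect once it is registered under DOWNLOADER_MIDDLEWARES, which the settings file later in this commit does; because it uses setdefault, a request that already carries a User-Agent header keeps it. A quick, hypothetical way to see the kind of values fake_useragent produces (assumes only that fake-useragent is installed):

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)  # a randomly selected real-world browser User-Agent string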

code-examples/laywer_profile_gather/lp/pipelines.py

Lines changed: 56 additions & 0 deletions
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
import csv
import string
import unicodedata
from lp import settings

exclude = set(string.punctuation)


def fully_clean_item(x):
    """
    Helper function to perform all the available cleanings
    """
    # Remove unicode characters
    x = str(x.encode('utf8'))
    # Remove extra commas
    x = x.replace(",", "")
    # Remove extra whitespace from the front and back
    x = x.strip()
    # Remove all extra line breaks and tabs
    x = x.replace("\n", "")
    x = x.replace("\t", "")
    return x


def write_to_csv(item):
    writer = csv.writer(open(settings.lawyer_profile_links_file, 'a'), lineterminator='\n')
    writer.writerow([item[key] for key in item.keys()])


def write_to_profiles_csv(item):
    writer = csv.writer(open(settings.lawyer_profiles_file, 'a'), lineterminator='\n')
    writer.writerow([item[key] for key in item.keys()])


class LpPipeline(object):
    def process_item(self, item, spider):
        write_to_csv(item)
        return item


class ProfilesPipeline(object):
    def process_item(self, item, spider):
        for key in item.keys():
            item[key] = fully_clean_item(item[key])
        write_to_profiles_csv(item)
        return item
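
fully_clean_item flattens each field to a single comma-free line before write_to_profiles_csv appends it, so stray commas, tabs, and line breaks cannot corrupt the CSV columns. A rough sketch of the effect on one messy value, assuming the Python 2 string semantics the rest of the project targets (scrapy.conf, raw_input); the firm name is invented:

messy = u"  Dewey, Cheatem & Howe\n"
print(fully_clean_item(messy))  # -> Dewey Cheatem & Howe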

code-examples/laywer_profile_gather/lp/settings.py

Lines changed: 38 additions & 0 deletions
# -*- coding: utf-8 -*-

# Scrapy settings for lp project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'lp'

SPIDER_MODULES = ['lp.spiders']
NEWSPIDER_MODULE = 'lp.spiders'

# Be nice to the sites we're crawling
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5.0
AUTOTHROTTLE_MAX_DELAY = 60.0
DOWNLOAD_DELAY = 5

# Disable cookies
COOKIES_ENABLED = False

# Define which pipeline we'll use
ITEM_PIPELINES = ['lp.pipelines.ProfilesPipeline']

# On the Pi
start_urls_file = '/home/pi/Dev/profile_gather/lp/data/start_urls.csv'
lawyer_profile_links_file = '/home/pi/Dev/profile_gather/lp/data/lawyer_profile_urls_updated.csv'
lawyer_profiles_file = '/home/pi/Dev/profile_gather/lp/data/lawyer_profiles.csv'

# Use the fake_useragent python library
# Requires you to: pip install fake-useragent
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'lp.middlewares.RandomUserAgentMiddleware': 400,
}
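
The commit keeps the CSV paths as plain module-level names that the pipeline reads back via settings.lawyer_profiles_file, and registers the pipeline as a list. Newer Scrapy releases expect ITEM_PIPELINES as a dict mapping the pipeline path to a priority; a hedged sketch of the equivalent setting, not part of the commit:

ITEM_PIPELINES = {
    'lp.pipelines.ProfilesPipeline': 300,
}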

code-examples/laywer_profile_gather/lp/spiders/__init__.py

Lines changed: 4 additions & 0 deletions
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Lines changed: 34 additions & 0 deletions
from scrapy import Spider
from scrapy.selector import Selector
from lp.items import LpItem
from lp import settings
import csv


class LpSpider(Spider):
    name = "lp"
    allowed_domains = ["MAIN_URL_HERE"]
    start_urls = []

    with open(settings.start_urls_file, 'r') as csvfile:
        data = csv.reader(csvfile)
        for row in data:
            for column in row:
                start_urls.append(column)

    def parse(self, response):
        lawyers = Selector(response).xpath('//div[@class="block_content"]/a')

        for lawyer in lawyers:
            item = LpItem()
            item['profile_link'] = lawyer.xpath(
                '@href').extract()[0]
            item['name'] = lawyer.xpath(
                'div[@class="bp_listings_result_header"]/text()').extract()[0]
            item['address_locality'] = lawyer.xpath(
                'div[@class="bp_listings_result_address"]/span[@itemprop="addressLocality"]/text()').extract()[0]
            item['address_region'] = lawyer.xpath(
                'div[@class="bp_listings_result_address"]/span[@itemprop="addressRegion"]/text()').extract()[0]
            item['job_title'] = lawyer.xpath(
                'div[@class="bp_listings_result_description"]/text()').extract()[0]
            yield item
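
With the settings above in place, the spider would normally be started from the project directory with "scrapy crawl lp"; the pipeline configured in settings.py then appends each yielded item to a CSV on disk using the paths defined there. The allowed_domains value (MAIN_URL_HERE) and the start-URL CSV are placeholders, so both need real values before a run.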
