
Commit 448695c

Data loaded with load_data.py util.
1 parent c94bea3 commit 448695c

4 files changed: +296 -0 lines changed

code/ch7-databases/.idea/ch7-databases.iml

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

code/ch7-databases/bin/load_data.py

Lines changed: 292 additions & 0 deletions
@@ -0,0 +1,292 @@
import json
import os
import sys
import time
from typing import List, Optional, Dict

# noinspection PyPackageRequirements
import progressbar
from dateutil.parser import parse

# Make the ch7-databases package root importable when running this script directly.
sys.path.insert(0, os.path.abspath(os.path.join(
    os.path.dirname(__file__), "..")))

import data.db_session as db_session
from data.package import Package
from data.release import Release
from data.user import User


def main():
    init_db()
    session = db_session.create_session()
    user_count = session.query(User).count()
    session.close()
    if user_count == 0:
        file_data = do_load_files()
        users = find_users(file_data)

        db_users = do_user_import(users)
        do_import_packages(file_data, db_users)

    do_summary()


def do_summary():
    session = db_session.create_session()

    print("Final numbers:")
    print("Users: {:,}".format(session.query(User).count()))
    print("Packages: {:,}".format(session.query(Package).count()))
    print("Releases: {:,}".format(session.query(Release).count()))


def do_user_import(user_lookup: Dict[str, str]) -> Dict[str, User]:
    print("Importing users ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(user_lookup)) as bar:
        for idx, (email, name) in enumerate(user_lookup.items()):
            session = db_session.create_session()
            session.expire_on_commit = False

            user = User()
            user.email = email
            user.name = name
            session.add(user)

            session.commit()
            bar.update(idx)

    print()
    sys.stderr.flush()
    sys.stdout.flush()

    # Re-query so the returned User objects are bound to a live session.
    session = db_session.create_session()
    return {u.email: u for u in session.query(User)}


def do_import_packages(file_data: List[dict], user_lookup: Dict[str, User]):
    errored_packages = []
    print("Importing packages and releases ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(file_data)) as bar:
        for idx, p in enumerate(file_data):
            try:
                load_package(p, user_lookup)
                bar.update(idx)
            except Exception as x:
                errored_packages.append(
                    (p, " *** Errored out for package {}, {}".format(p.get('package_name'), x)))
                raise
    sys.stderr.flush()
    sys.stdout.flush()
    print()
    print("Completed packages with {} errors.".format(len(errored_packages)))
    for (p, txt) in errored_packages:
        print(txt)


def do_load_files() -> List[dict]:
    data_path = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '../../../data/pypi-top-100'))
    print("Loading files from {}".format(data_path))
    files = get_file_names(data_path)
    print("Found {:,} files, loading ...".format(len(files)), flush=True)
    time.sleep(.1)

    file_data = []
    with progressbar.ProgressBar(max_value=len(files)) as bar:
        for idx, f in enumerate(files):
            file_data.append(load_file_data(f))
            bar.update(idx)

    sys.stderr.flush()
    sys.stdout.flush()
    print()
    return file_data


def find_users(data: List[dict]) -> dict:
    print("Discovering users...", flush=True)
    found_users = {}

    with progressbar.ProgressBar(max_value=len(data)) as bar:
        for idx, p in enumerate(data):
            info = p.get('info')
            found_users.update(get_email_and_name_from_text(
                info.get('author'), info.get('author_email')))
            found_users.update(get_email_and_name_from_text(
                info.get('maintainer'), info.get('maintainer_email')))
            bar.update(idx)

    sys.stderr.flush()
    sys.stdout.flush()
    print()
    print("Discovered {:,} users".format(len(found_users)))
    print()

    return found_users


def get_email_and_name_from_text(name: str, email: str) -> dict:
    data = {}

    if not name or not email:
        return data

    emails = email.strip().lower().split(',')
    names = name
    if len(emails) > 1:  # multiple emails implies the names are comma separated too
        names = name.strip().split(',')

    for n, e in zip(names, emails):
        if not n or not e:
            continue

        data[e.strip()] = n.strip()

    return data


def load_file_data(filename: str) -> dict:
    try:
        with open(filename, 'r', encoding='utf-8') as fin:
            data = json.load(fin)
    except Exception as x:
        print("ERROR in file: {}, details: {}".format(filename, x), flush=True)
        raise

    return data


def load_package(data: dict, user_lookup: Dict[str, User]):
    try:
        info = data.get('info', {})

        p = Package()
        p.id = data.get('package_name', '').strip()
        if not p.id:
            return

        p.author = info.get('author')
        p.author_email = info.get('author_email')

        releases = build_releases(p.id, data.get("releases", {}))

        if releases:
            p.created_date = releases[0].created_date

        # Maintainers are discovered here but not yet attached to the package.
        maintainers_lookup = get_email_and_name_from_text(
            info.get('maintainer'), info.get('maintainer_email'))
        maintainers = []

        p.summary = info.get('summary')
        p.description = info.get('description')

        p.home_page = info.get('home_page')
        p.docs_url = info.get('docs_url')
        p.package_url = info.get('package_url')

        p.license = detect_license(info.get('license'))

        session = db_session.create_session()
        session.add(p)
        session.add_all(releases)
        if maintainers:
            session.add_all(maintainers)
        session.commit()
        session.close()
    except OverflowError:
        # What the heck, people just putting fake data in here
        # Size is terabytes...
        pass
    except Exception:
        raise


def detect_license(license_text: str) -> Optional[str]:
    if not license_text:
        return None

    license_text = license_text.strip()

    if len(license_text) > 100 or '\n' in license_text:
        return "CUSTOM"

    license_text = license_text \
        .replace('Software License', '') \
        .replace('License', '')

    if '::' in license_text:
        # E.g. 'License :: OSI Approved :: Apache Software License'
        return license_text \
            .split(':')[-1] \
            .replace('  ', ' ') \
            .strip()

    return license_text.strip()


def build_releases(package_id: str, releases: dict) -> List[Release]:
    db_releases = []
    for k in releases.keys():
        all_releases_for_version = releases.get(k)
        if not all_releases_for_version:
            continue

        # Use the last uploaded file for this version as the representative release.
        v = all_releases_for_version[-1]

        r = Release()
        r.package_id = package_id
        r.major_ver, r.minor_ver, r.build_ver = make_version_num(k)
        r.created_date = parse(v.get('upload_time'))
        r.comment = v.get('comment_text')
        r.url = v.get('url')
        r.size = int(v.get('size', 0))

        db_releases.append(r)

    return db_releases


def make_version_num(version_text):
    major, minor, build = 0, 0, 0
    if version_text:
        version_text = version_text.split('b')[0]  # crude handling of beta markers like '1.2b3'
        parts = version_text.split('.')
        if len(parts) == 1:
            major = try_int(parts[0])
        elif len(parts) == 2:
            major = try_int(parts[0])
            minor = try_int(parts[1])
        elif len(parts) == 3:
            major = try_int(parts[0])
            minor = try_int(parts[1])
            build = try_int(parts[2])

    return major, minor, build


def try_int(text) -> int:
    try:
        return int(text)
    except (TypeError, ValueError):
        return 0


def init_db():
    top_folder = os.path.dirname(__file__)
    rel_file = os.path.join('..', 'db', 'pypi.sqlite')
    db_file = os.path.abspath(os.path.join(top_folder, rel_file))
    db_session.global_init(db_file)


def get_file_names(data_path: str) -> List[str]:
    files = []
    for f in os.listdir(data_path):
        if f.endswith('.json'):
            files.append(
                os.path.abspath(os.path.join(data_path, f))
            )

    files.sort()
    return files


if __name__ == '__main__':
    main()
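
After running the loader, a quick sanity check could be done with a small script like the sketch below. This is only an illustration, not part of the commit: it reuses the same data.db_session, Package, and Release modules that load_data.py imports, and assumes the generated db/pypi.sqlite sits one level above the folder the script runs from, as init_db() expects.

# sanity_check.py -- hypothetical helper, not part of this commit
import os
import sys

# Same path trick as load_data.py so the data package is importable.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import data.db_session as db_session
from data.package import Package
from data.release import Release

# Assumed to be the same db/pypi.sqlite file that load_data.py creates.
db_file = os.path.abspath(os.path.join(
    os.path.dirname(__file__), '..', 'db', 'pypi.sqlite'))
db_session.global_init(db_file)

session = db_session.create_session()
print("Packages: {:,}".format(session.query(Package).count()))
print("Releases: {:,}".format(session.query(Release).count()))
session.close()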

code/ch7-databases/db/pypi.sqlite

1.87 MB
Binary file not shown.
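
This is presumably the SQLite database produced by running bin/load_data.py above; init_db() resolves the same ../db/pypi.sqlite path relative to the bin/ folder.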

code/ch7-databases/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -7,3 +7,6 @@ git+https://github.com/mikeckennedy/fastapi-chameleon
 
 starlette==0.13.6
 SQLAlchemy==1.3.22
+progressbar2
+python-dateutil
+

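The two new entries match the loader's imports: progressbar2 provides the progressbar module used for the console progress bars, and python-dateutil provides dateutil.parser.parse for the upload_time values. They would typically be installed along with the rest of the file via pip install -r requirements.txt before running the loader.
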
0 commit comments
