from bs4 import BeautifulSoup
from lxml import etree
import requests
import json
import datetime
import sys

# Util: parse a YYYY-MM-DD string into a datetime.date
def datestr_to_date(datestr):
    [year, month, day] = datestr.split('-')
    return datetime.date(
        year=int(year),
        month=int(month),
        day=int(day)
    )

# Reference point: the ET archive addresses each day by a numeric "starttime" id;
# 2001-01-01 corresponds to id 36892, and the id increases by one per day.
reference_date = datetime.date(2001, 1, 1)
reference_date_id = 36892

if len(sys.argv) < 3:
    print('Usage: economictimes_scraper.py START_DATE END_DATE\nDate format: YYYY-MM-DD')
    sys.exit(1)

start_date = datestr_to_date(sys.argv[1])
end_date = datestr_to_date(sys.argv[2])
start_dateid = reference_date_id + (start_date - reference_date).days
end_dateid = reference_date_id + (end_date - reference_date).days

if (start_date - reference_date).days < 0:
    print('Error: Start date must not be earlier than 2001-01-01')
    sys.exit(1)
if (end_date - start_date).days < 0:
    print('Error: End date must not be earlier than Start date')
    sys.exit(1)


# Fetch the JSON-LD metadata embedded in a news article page, given its URL
def fetch_news_article(url):
    html = requests.get(url).content
    root = etree.HTML(html)
    x = root.xpath("/html/body//script[@type='application/ld+json']")
    metadata = None  # stays None when the article does not exist (404) or has no JSON-LD
    if len(x) >= 2:
        metadata = x[1].text  # the second ld+json block carries the article metadata
    return metadata

et_host = 'https://economictimes.indiatimes.com'
et_date_url = 'https://economictimes.indiatimes.com/archivelist/starttime-'
et_date_extension = '.cms'

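# For illustration: with the constants above, the archive page for the reference
# date 2001-01-01 (dateid 36892) would be requested from
#   https://economictimes.indiatimes.com/archivelist/starttime-36892.cms
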
fetched_data = {}

# Walk each day in the requested range: fetch that day's archive listing page,
# then fetch metadata for every article linked from it.
for dateid in range(start_dateid, end_dateid + 1):
    date = str(reference_date + datetime.timedelta(days=dateid - reference_date_id))
    html = requests.get('{}{}{}'.format(et_date_url, dateid, et_date_extension)).content
    soup = BeautifulSoup(html, 'html.parser')
    fetched_data[date] = []
    for x in soup.select('#pageContent table li a'):
        print(x.text)
        article_metadata = fetch_news_article(et_host + x['href'])
        fetched_data[date].append({
            "metadata": article_metadata,
            "title": x.text,
            "url": et_host + x['href']
        })

# Dump everything collected into a single JSON file named after the date range.
out_filename = 'ET_NewsData_{}_{}.json'.format(start_date, end_date)
with open(out_filename, 'w') as output_file:
    output_file.write(json.dumps(fetched_data, indent=2))
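
# Example usage (a sketch; the script name matches the usage message above, and the
# dates are arbitrary illustrations):
#   python economictimes_scraper.py 2021-01-01 2021-01-07
# This writes ET_NewsData_2021-01-01_2021-01-07.json, mapping each date string to a
# list of {"metadata", "title", "url"} records for that day's articles.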