0

I'm trying to scrape from this website. But I'm getting unicode error. I did some searching and it seems its a encoding issue? but after adding the encoding='utf-8'it doesn't go away. not sure what the issue is.

 import bs4 as bs
 import urllib.request
 import csv
 import numpy as np
 base_url = "https://www.mobygames.com/developer/sheet/view/developerId,"
 url_list =[]
 with open('url.csv', 'r') as f:
 reader = csv.reader(f)
 for row in reader:
 url_list.append(row[0])
 def extract(gameurl):
 req = urllib.request.Request(gameurl,headers={'User-Agent': 'Mozilla/5.0'})
 sauce = urllib.request.urlopen(req).read()
 soup = bs.BeautifulSoup(sauce,'lxml')
 infopage = soup.find_all("div", {"class":"col-md-8 col-lg-8"})
 core_list =[]
 for credits in infopage:
 niceHeaderTitle = credits.find_all("h1", {"class":"niceHeaderTitle"})
 name = niceHeaderTitle[0].text
 Titles = credits.find_all("h3", {"class":"clean"})
 Titles = [title.get_text() for title in Titles]
 tr = credits.find_all("tr")
 for i in range(len(tr)):
 row = tr[i].get_text(strip=True)
 if row in Titles:
 title = row
 elif len(row) > 1:
 games=[name,title,row]
 core_list.append(games)
 core_list = np.matrix(core_list)
 return core_list
 def csv_write(url_data):
 with open ('HRdata.csv','a',encoding='utf-8') as file:
 writer=csv.writer(file)
 for row in url_data:
 writer.writerow(row)
 for url in url_list:
 link = base_url + url 
 url_data = extract(link)
 csv_write(url_data)

I thought it was because when I'm trying to write it into csv file so I added encoding='utf-8' but it didn't work... not sure what I should do to resolve this.

This is the error message

---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-22-31928933be8c> in <module>()
 52 for url in url_list:
 53 link = base_url + url
---> 54 url_data = extract(link)
 55 csv_write(url_data)
 56 
<ipython-input-22-31928933be8c> in extract(gameurl)
 15 def extract(gameurl):
 16 req = urllib.request.Request(gameurl,headers={'User-Agent': 'Mozilla/5.0'})
---> 17 sauce = urllib.request.urlopen(req).read()
 18 soup = bs.BeautifulSoup(sauce,'lxml')
 19 infopage = soup.find_all("div", {"class":"col-md-8 col-lg-8"})
C:\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
 221 else:
 222 opener = _opener
--> 223 return opener.open(url, data, timeout)
 224 
 225 def install_opener(opener):
C:\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
 524 req = meth(req)
 525 
--> 526 response = self._open(req, data)
 527 
 528 # post-process response
C:\Anaconda3\lib\urllib\request.py in _open(self, req, data)
 542 protocol = req.type
 543 result = self._call_chain(self.handle_open, protocol, protocol +
--> 544 '_open', req)
 545 if result:
 546 return result
C:\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
 502 for handler in handlers:
 503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
 505 if result is not None:
 506 return result
C:\Anaconda3\lib\urllib\request.py in https_open(self, req)
 1359 def https_open(self, req):
 1360 return self.do_open(http.client.HTTPSConnection, req,
-> 1361 context=self._context, check_hostname=self._check_hostname)
 1362 
 1363 https_request = AbstractHTTPHandler.do_request_
C:\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
 1316 try:
 1317 h.request(req.get_method(), req.selector, req.data, headers,
-> 1318 encode_chunked=req.has_header('Transfer-encoding'))
 1319 except OSError as err: # timeout error
 1320 raise URLError(err)
C:\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
 1237 encode_chunked=False):
 1238 """Send a complete request to the server."""
-> 1239 self._send_request(method, url, body, headers, encode_chunked)
 1240 
 1241 def _send_request(self, method, url, body, headers, encode_chunked):
C:\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
 1248 skips['skip_accept_encoding'] = 1
 1249 
-> 1250 self.putrequest(method, url, **skips)
 1251 
 1252 # chunked encoding will happen if HTTP/1.1 is used and either
C:\Anaconda3\lib\http\client.py in putrequest(self, method, url, skip_host, skip_accept_encoding)
 1115 
 1116 # Non-ASCII characters should have been eliminated earlier
-> 1117 self._output(request.encode('ascii'))
 1118 
 1119 if self._http_vsn == 11:
UnicodeEncodeError: 'ascii' codec can't encode characters in position 38-40: ordinal not in range(128) 
asked Mar 18, 2020 at 16:23
2
  • Whats the value of the url variable when this error occurs? Commented Mar 18, 2020 at 16:32
  • I think its "1" because the url.csv is just a list of numbers from 1 to 19 Commented Mar 18, 2020 at 20:18

1 Answer 1

1

http\client.py is trying to ascii encode your gameurl string, which it can't do because it includes a character that isn't in the ascii character set.

You need to URL encode the url, excluding the scheme (https://), by using the urllib.parse.quote() function. You only need to change the first line in this for loop:

 for url in url_list:
 link = base_url + urllib.parse.quote(url) # just doing the end is fine in this case
 url_data = extract(link)
 csv_write(url_data)

Alternatively, you can use the popular Requests module, which seamlessly takes care of this for you (I highly recommend it!).

answered Mar 19, 2020 at 14:01
Sign up to request clarification or add additional context in comments.

1 Comment

It turns out that python didn't like this part base_url = "mobygames.com/developer/sheet/view/developerId," after I changed base_url to a more simple form it worked??

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.