How to output parsed HTML into a file?

Question 1

(updated) After some help, I now have the following code. I can output to a csv file but I can't seem to get the csv to have the proper number of columns:

# Asker's original attempt, kept exactly as posted (Python 2: note the
# `print` statement on the last line).
# NOTE(review): the nesting indentation appears to have been flattened when
# this page was captured; the intended nesting has to be read from the
# for/if/elif keywords rather than from the columns.
soup = BeautifulSoup(html_doc)
import csv
# Tab-delimited writer; the file object returned by open() is never closed
# explicitly.
outfile=csv.writer(open('outputrows.csv','wb'),delimiter='\t')
#def get_movie_info(imdb):
tbl = soup.find('table')
rows = tbl.findAll('tr')
# A single flat list shared by every table row (the name also shadows the
# builtin `list`).
list=[]
for row in rows:
 cols = row.find_all('td')
 for col in cols:
 # Branch on the first CSS class of the cell: 'title', 'number' or 'image'.
 if col.has_attr('class') and col['class'][0] == 'title':
 spans = col.find_all('span')
 for span in spans:
 # The movie ID is read from the wlb_wrapper span's data-tconst attribute.
 if span.has_attr('class') and span['class'][0] == 'wlb_wrapper':
 ID = span.get('data-tconst')
 list.append(ID)
 elif col.has_attr('class') and col['class'][0] == 'number':
 rank = col.text
 list.append(rank) 
 elif col.has_attr('class') and col['class'][0] == 'image':
 hrefs = col.find_all('a')
 for href in hrefs:
 # The movie name is read from the link's title attribute.
 moviename = href.get('title')
 list.append(moviename)
# Every element of the flat list is a single string, so writerows() treats
# each value as a row of its own instead of writing one row per movie.
outfile.writerows(list)
print list

The problem is that it outputs in this format, which is just one column of data:

1.
The Shawshank Redemption (1994)
tt0111161
2.
The Dark Knight (2008)
tt0468569
3.
Inception (2010)
tt1375666

when I want 3 columns of data as shown below:

1. The Shawshank Redemption (1994) tt0111161
2. The Dark Knight (2008) tt0468569
3. Inception (2010) tt1375666

sample html code:

 <tr class="odd detailed">
 <td class="number">
 48.
 </td>
 <td class="image">
 <a href="/title/tt0082971/" title="Raiders of the Lost Ark (1981)">
 <img alt="Raiders of the Lost Ark (1981)" height="74" src="http://ia.media-imdb.com/images/M/MV5BMjA0ODEzMTc1Nl5BMl5BanBnXkFtZTcwODM2MjAxNA@@._V1._SX54_CR0,0,54,74_.jpg" title="Raiders of the Lost Ark (1981)" width="54"/>
 </a>
 </td>
 <td class="title">
 <span class="wlb_wrapper" data-caller-name="search" data-size="small" data-tconst="tt0082971">
 </span>
 <a href="/title/tt0082971/">
 Raiders of the Lost Ark
 </a>
 <span class="year_type">
 (1981)
 </span>
 <br/>

Question 2

Can you share a sample of the HTML, i.e. html_doc?

Question 3

ok, I added the sample HTML into the post. Ideally, I want to have a CSV with 3 columns for each of those data points

Question 4

Can you please try this (not optimized solution but should do the job):

# Parse the page markup held in html_doc (defined outside this snippet);
# get_movie_info() below reads this module-level `soup`.
# NOTE(review): no parser is named here - presumably the library default is
# intended; confirm.
soup = BeautifulSoup(html_doc)
def _first_class(tag):
    """Return the first CSS class of *tag*, or None if it has none."""
    if tag is None:
        return None
    classes = tag.get('class')
    return classes[0] if classes else None


def get_movie_info():
    """Yield one ``(image_title, data_tconst, number)`` tuple per table row.

    Reads the module-level ``soup`` document, takes its first ``<table>``
    and walks every ``<tr>`` in it.  Within a row, each ``<td>`` is
    dispatched on its first CSS class:

    * ``image``  -> ``title`` attribute of the first ``<a>`` in the cell
    * ``title``  -> ``data-tconst`` attribute of the first ``<span>``, but
      only when that span's first class is ``wlb_wrapper``
    * ``number`` -> the cell text with surrounding whitespace removed

    A field whose cell, link, span or attribute is missing stays ``''``, so
    every row (including header rows without matching cells) yields exactly
    three values.  Nothing is yielded when the document has no table.
    """
    tbl = soup.find('table')
    if tbl is None:
        # No table in the document: there are no rows to report.
        return
    for row in tbl.find_all('tr'):
        # Reset the three fields for every row; the order of the '', '', ''
        # defaults does not influence the order of the output columns.
        image_title, data_tconst, number = '', '', ''
        for col in row.find_all('td'):
            kind = _first_class(col)
            if kind == 'image':
                link = col.find('a')
                # find() returns None when the cell contains no link.
                if link is not None:
                    image_title = link.get('title', '')
            elif kind == 'title':
                span = col.find('span')
                # _first_class() tolerates a missing span (None).
                if _first_class(span) == 'wlb_wrapper':
                    data_tconst = span.get('data-tconst', '')
            elif kind == 'number':
                # Strip the whitespace/newlines around the rank so the value
                # stays on a single line of the output file.
                number = col.text.strip()
        # The order of this tuple is the column order seen by the consumer.
        yield (image_title, data_tconst, number)
#################################################
import csv
# Write one tab-separated line per tuple produced by get_movie_info().
# The `with` block guarantees the file is flushed and closed even if the
# generator raises part-way through.
# NOTE: 'wb' is the mode the Python 2 csv module expects; on Python 3 the
# csv documentation calls for open(..., 'w', newline='') instead.
with open('outputrows.csv', 'wb') as csv_file:
    outfile = csv.writer(csv_file, delimiter='\t')
    for row in get_movie_info():
        outfile.writerow(row)

Question 5

Hey now, This actually worked. The file is formatting exactly as it needs to be. Thanks for that. Also I noticed that changing the order where it says yield led to a change in the order of the columns. The order up by ('','','') didn't seem to affect the order of the columns in the file; just an fyi...anyway thanks again

Question 6

Have you tried returning a list of the printed rows from the get_movie_info function?

def get_movie_info():
    """Return a list of rows, one per ``<tr>`` that contains movie data.

    Reads the module-level ``soup`` document and walks the rows of its first
    ``<table>``.  For each row the values are collected in the order their
    cells occur:

    * ``image`` cells contribute the ``title`` attribute of every ``<a>``
    * ``title`` cells contribute the ``data-tconst`` attribute of every
      ``<span>`` whose first class is ``wlb_wrapper``
    * ``number`` cells contribute their text, stripped of whitespace

    Each value is also printed as it is found.  Values belonging to the same
    ``<tr>`` are grouped into one inner list so that ``csv.writer.writerows``
    writes them as columns of a single line; appending them to one flat list
    would give every value a line of its own.  Rows that contribute no
    values (e.g. header rows) are skipped.  An empty list is returned when
    the document has no table.
    """
    returnedRows = []
    tbl = soup.find('table')
    if tbl is None:
        return returnedRows
    for row in tbl.find_all('tr'):
        rowValues = []  # values of this <tr> only
        for col in row.find_all('td'):
            classes = col.get('class')
            kind = classes[0] if classes else None
            if kind == 'image':
                for href in col.find_all('a'):
                    title = href.get('title')
                    print(title)
                    rowValues.append(title)  # <-- append 'title'
            elif kind == 'title':
                for span in col.find_all('span'):
                    spanClasses = span.get('class')
                    if spanClasses and spanClasses[0] == 'wlb_wrapper':
                        tconst = span.get('data-tconst')
                        print(tconst)
                        rowValues.append(tconst)  # <-- append 'tconst'
            elif kind == 'number':
                # Strip surrounding whitespace/newlines from the rank text.
                number = col.text.strip()
                print(number)
                rowValues.append(number)  # <-- append 'number'
        if rowValues:
            returnedRows.append(rowValues)  # <-- one entry per table row
    return returnedRows  # <-- then return the list

And executed this way,

import csv
# Write everything returned by get_movie_info() as tab-separated lines.
# The `with` block guarantees the file is flushed and closed.
# NOTE: 'wb' is the mode the Python 2 csv module expects; on Python 3 the
# csv documentation calls for open(..., 'w', newline='') instead.
with open('outputrows.tsv', 'wb') as tsv_file:
    outfile = csv.writer(tsv_file, delimiter='\t')
    rows = get_movie_info()
    outfile.writerows(rows)

Question 7

Also, parentheses in the function definition and execution would help

Question 8

@PeterGibson True, didn't even realize!

Question 9

OK...I was successful outputting everything to a csv. But now, each item is on its own line instead of having 3 columns; one for each category.

Question 10

Here is an easy way to do this:

#!/usr/bin/env python
"""Collect movie id, title and year from a listing page and save them as CSV."""
import pandas as pd
# Import the parser class itself: `import BeautifulSoup as BeautifulSoup`
# binds the *module*, and calling a module raises TypeError.
from BeautifulSoup import BeautifulSoup
import requests

url = 'some_url.html'
r = requests.get(url)
# Fail loudly on an HTTP error instead of scraping an error page.
r.raise_for_status()

movie_id = []
title = []
year = []
bs = BeautifulSoup(r.text)
# Every <td class="title"> cell describes one movie.
for movie in bs.findAll('td', 'title'):
    link = movie.find('a')
    # href looks like "/title/tt0082971/", so element 2 of the split is the id.
    movie_id.append(link.get('href').split('/')[2])
    title.append(link.contents[0])
    year.append(movie.find('span', 'year_type').contents[0])

movie_dic = {'movie_id': movie_id, 'title': title, 'year': year}
movie_data = pd.DataFrame(movie_dic, index = None)
file_name = "~/movies.txt"
movie_data.to_csv(file_name, sep = ',', header = True, encoding = 'utf-8', mode = 'w')

Question 11

Thanks, I'll be sure to give this one a try as well

Chandan 7666 silver badges8 bronze badges · Accepted Answer · 2014-02-06 03:16:12Z

Can you please try this (not optimized solution but should do the job):

# Parse the page markup held in html_doc (defined outside this snippet);
# get_movie_info() below reads this module-level `soup`.
# NOTE(review): no parser is named here - presumably the library default is
# intended; confirm.
soup = BeautifulSoup(html_doc)
def _first_class(tag):
    """Return the first CSS class of *tag*, or None if it has none."""
    if tag is None:
        return None
    classes = tag.get('class')
    return classes[0] if classes else None


def get_movie_info():
    """Yield one ``(image_title, data_tconst, number)`` tuple per table row.

    Reads the module-level ``soup`` document, takes its first ``<table>``
    and walks every ``<tr>`` in it.  Within a row, each ``<td>`` is
    dispatched on its first CSS class:

    * ``image``  -> ``title`` attribute of the first ``<a>`` in the cell
    * ``title``  -> ``data-tconst`` attribute of the first ``<span>``, but
      only when that span's first class is ``wlb_wrapper``
    * ``number`` -> the cell text with surrounding whitespace removed

    A field whose cell, link, span or attribute is missing stays ``''``, so
    every row (including header rows without matching cells) yields exactly
    three values.  Nothing is yielded when the document has no table.
    """
    tbl = soup.find('table')
    if tbl is None:
        # No table in the document: there are no rows to report.
        return
    for row in tbl.find_all('tr'):
        # Reset the three fields for every row; the order of the '', '', ''
        # defaults does not influence the order of the output columns.
        image_title, data_tconst, number = '', '', ''
        for col in row.find_all('td'):
            kind = _first_class(col)
            if kind == 'image':
                link = col.find('a')
                # find() returns None when the cell contains no link.
                if link is not None:
                    image_title = link.get('title', '')
            elif kind == 'title':
                span = col.find('span')
                # _first_class() tolerates a missing span (None).
                if _first_class(span) == 'wlb_wrapper':
                    data_tconst = span.get('data-tconst', '')
            elif kind == 'number':
                # Strip the whitespace/newlines around the rank so the value
                # stays on a single line of the output file.
                number = col.text.strip()
        # The order of this tuple is the column order seen by the consumer.
        yield (image_title, data_tconst, number)
#################################################
import csv
# Write one tab-separated line per tuple produced by get_movie_info().
# The `with` block guarantees the file is flushed and closed even if the
# generator raises part-way through.
# NOTE: 'wb' is the mode the Python 2 csv module expects; on Python 3 the
# csv documentation calls for open(..., 'w', newline='') instead.
with open('outputrows.csv', 'wb') as csv_file:
    outfile = csv.writer(csv_file, delimiter='\t')
    for row in get_movie_info():
        outfile.writerow(row)

Hey now, This actually worked. The file is formatting exactly as it needs to be. Thanks for that. Also I noticed that changing the order where it says yield led to a change in the order of the columns. The order up by ('','','') didn't seem to affect the order of the columns in the file; just an fyi...anyway thanks again

CollectivesTM on Stack Overflow

How to output parsed HTML into a file?

3 Answers 3

1 Comment

3 Comments

1 Comment

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

CollectivesTM on Stack Overflow

3 Answers 3

1 Comment

3 Comments

1 Comment

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related