At the moment I use the following, but I'm sure it's not efficient at all and I would like to improve my code:

df1['comms_clean'] = df1['comms'].apply(lambda x : clean_text(x))
df2['comms_clean'] = df2['comms'].apply(lambda x : clean_text(x))
df3['comms_clean'] = df3['comms'].apply(lambda x : clean_text(x))
df4['comms_clean'] = df4['comms'].apply(lambda x : clean_text(x))
df5['comms_clean'] = df5['comms'].apply(lambda x : clean_text(x))
df6['comms_clean'] = df6['comms'].apply(lambda x : clean_text(x))
df7['comms_clean'] = df7['comms'].apply(lambda x : clean_text(x))
df8['comms_clean'] = df8['comms'].apply(lambda x : clean_text(x))
df9['comms_clean'] = df9['comms'].apply(lambda x : clean_text(x))
df10['comms_clean'] = df10['comms'].apply(lambda x : clean_text(x))
df11['comms_clean'] = df11['comms'].apply(lambda x : clean_text(x))
df12['comms_clean'] = df12['comms'].apply(lambda x : clean_text(x))
df13['comms_clean'] = df13['comms'].apply(lambda x : clean_text(x))
df14['comms_clean'] = df14['comms'].apply(lambda x : clean_text(x))
df15['comms_clean'] = df15['comms'].apply(lambda x : clean_text(x))

For now I tried this, but I cannot figure out how to use the i in my loop:

for i in range(1,15):
    df{i}['comms_clean'] = df{i}['comms'].apply(lambda x : clean_text(x))

Here's my clean_text() function:

def clean_text(text):
    text = text.lower()                                   # make text lowercase
    text = re.sub(r'\[.*?\]', '', text)                   # remove text in square brackets
    text = re.sub(r'\n', '', text)                        # remove newlines
    text = re.sub(r'\[\[User.*', '', text)                # remove "[[User..." fragments
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)  # remove IP addresses
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)  # remove hyperlinks
    text = re.sub(r'\:(.*?)\:', '', text)                 # remove emoticons like :smile:
    text = re.sub(r'[\w\.-]+@[\w\.-]+', '', text)         # remove email addresses
    text = re.sub(r'(?<=@)\w+', '', text)                 # remove @mentions
    text = re.sub(r'[0-9]+', '', text)                    # remove numbers
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)             # remove non-alphanumeric characters like '@', '#', '.', '(', ')'
    text = re.sub(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]*', '', text)  # remove punctuation
    text = re.sub(r'<.*?>+', '', text)                    # remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)       # remove any remaining punctuation
    text = re.sub(r'\w*\d\w*', '', text)                  # remove words containing digits
    text = tokenizer.tokenize(text)                       # split into word tokens
    text = [word for word in text if word not in stop_words]   # remove stop words
    text = [lemmatizer.lemmatize(word) for word in text]  # lemmatize each word
    #text = [stemmer.stem(word) for word in text]
    final_text = ' '.join([w for w in text if len(w) > 1])  # drop one-letter words
    return final_text
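
For reference, tokenizer, stop_words and lemmatizer are defined earlier in my script; the setup looks roughly like this (rough sketch based on NLTK, exact details may differ):

# rough sketch of the globals clean_text relies on (NLTK-based; exact setup may differ)
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')           # splits text into word tokens
stop_words = set(stopwords.words('english'))  # common English words to drop
lemmatizer = WordNetLemmatizer()              # reduces each word to its base form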

And here's the code used to obtain my dataframes (that's the script for one of them; for the others I just change the HTML sources):

import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://www.tripadvisor.ca/Hotel_Review-g186338-d215539-Reviews-or'
urls = [ '{root}{i}-OYO_Apollo_Hotel_Bayswater-London_England.html#REVIEWS'.format(root=root_url, i=i) for i in range(5,440,5) ]
comms = []
notes = []
#datestostay = []
dates = []
for url in urls:
    results = requests.get(url)
    time.sleep(20)  # wait between requests
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('div', class_='_2wrUUKlw _3hFEdNs8')
    for container in commentary:
        comm = container.find('q', class_='IRsGHoPm').text.strip()
        comms.append(comm)
        #date_tag = container.find("div", class_="_1O8E5N17").text
        #date_text,date_value = str.split(date_tag,':')
        #datestostay.append(date_value)
        comm1 = str(container.find("div", class_="nf9vGX55").find('span'))
        rat = re.findall(r'\d+', comm1)
        rat1 = str(rat)[2]  # first digit of the bubble rating, e.g. "['50']" -> '5'
        notes.append(rat1)
        datereal = container.find("div", class_="_2fxQ4TOx").text
        date = datereal[-9:]  # keep the trailing date part of the string
        dates.append(date)
data = pd.DataFrame({
    'comms' : comms,
    #'datestostay' : datestostay,
    'notes' : notes,
    'dates' : dates
})
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
data['dates'] = data.dates.dt.strftime('%Y-%m')
'''
data['datestostay'] = pd.to_datetime(data['datestostay']).dt.date
data['datestostay'] = pd.to_datetime(data['datestostay'])
data['datestostay'] = data.datestostay.dt.strftime('%Y-%m')
'''
#print(data.head())
data.to_csv('table4.csv', sep=';', index=False)

Thanks for your help :)

asked Jan 7, 2021 at 13:44

1 Answer


Where do those dfs come from? You could put them all in a list:

dfs = [df1, df2, df3, ...] # etc

Then iterate over it:

for df in dfs:
    ... # do something with each df
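
Applied to your case, the loop body is just the same apply call; note that apply can take clean_text directly, so the lambda isn't needed:

dfs = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15]
for df in dfs:
    df['comms_clean'] = df['comms'].apply(clean_text)

Assigning the new column inside the loop modifies each original DataFrame, because the list only holds references to them.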

Most likely you should refactor the code to get rid of all those dfX variables and put the DataFrames directly into a list or a dict at the point where you create them.
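
For example, since your scraping script writes one CSV per hotel, you could load the files straight into a dict and never create df1 … df15 at all (the file names below are just an illustration):

import pandas as pd

files = {f'hotel{i}': f'table{i}.csv' for i in range(1, 16)}  # illustrative names only
dfs = {name: pd.read_csv(path, sep=';') for name, path in files.items()}

for name, df in dfs.items():
    df['comms_clean'] = df['comms'].apply(clean_text)

# e.g. dfs['hotel4'] now holds the cleaned column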

answered Jan 7, 2021 at 13:57