At the moment I use the following, but I'm sure it's not efficient at all and I would like to improve my code:

df1['comms_clean'] = df1['comms'].apply(lambda x : clean_text(x))
df2['comms_clean'] = df2['comms'].apply(lambda x : clean_text(x))
df3['comms_clean'] = df3['comms'].apply(lambda x : clean_text(x))
df4['comms_clean'] = df4['comms'].apply(lambda x : clean_text(x))
df5['comms_clean'] = df5['comms'].apply(lambda x : clean_text(x))
df6['comms_clean'] = df6['comms'].apply(lambda x : clean_text(x))
df7['comms_clean'] = df7['comms'].apply(lambda x : clean_text(x))
df8['comms_clean'] = df8['comms'].apply(lambda x : clean_text(x))
df9['comms_clean'] = df9['comms'].apply(lambda x : clean_text(x))
df10['comms_clean'] = df10['comms'].apply(lambda x : clean_text(x))
df11['comms_clean'] = df11['comms'].apply(lambda x : clean_text(x))
df12['comms_clean'] = df12['comms'].apply(lambda x : clean_text(x))
df13['comms_clean'] = df13['comms'].apply(lambda x : clean_text(x))
df14['comms_clean'] = df14['comms'].apply(lambda x : clean_text(x))
df15['comms_clean'] = df15['comms'].apply(lambda x : clean_text(x))

For now I tried this, but I cannot figure out how to use the i in my loop:

for i in range(1,15):
    df{i}['comms_clean'] = df{i}['comms'].apply(lambda x : clean_text(x))

Here's my clean_text() function:

def clean_text(text):
    text = text.lower()                                   # make text lowercase
    text = re.sub(r'\[.*?\]', '', text)                   # remove text in square brackets
    text = re.sub(r'\n', '', text)                        # remove newlines
    text = re.sub(r'\[\[User.*', '', text)                # remove "[[User..." fragments
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)  # remove IP addresses
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)  # remove hyperlinks
    text = re.sub(r'\:(.*?)\:', '', text)                 # remove emoticons like :smile:
    text = re.sub(r'[\w\.-]+@[\w\.-]+', '', text)         # remove email addresses
    text = re.sub(r'(?<=@)\w+', '', text)                 # remove @mentions
    text = re.sub(r'[0-9]+', '', text)                    # remove numbers
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)             # remove non-alphanumeric characters like '@', '#', '.', '(', ')'
    text = re.sub(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]*', '', text)  # remove punctuation
    text = re.sub(r'<.*?>+', '', text)                    # remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)       # remove any remaining punctuation
    text = re.sub(r'\w*\d\w*', '', text)                  # remove words containing digits
    text = tokenizer.tokenize(text)                       # split into word tokens
    text = [word for word in text if word not in stop_words]   # remove stop words
    text = [lemmatizer.lemmatize(word) for word in text]  # lemmatize each word
    #text = [stemmer.stem(word) for word in text]
    final_text = ' '.join([w for w in text if len(w) > 1])  # drop one-letter words
    return final_text
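
For reference, tokenizer, stop_words and lemmatizer are defined earlier in my script; the setup looks roughly like this (rough sketch based on NLTK, exact details may differ):

# rough sketch of the globals clean_text relies on (NLTK-based; exact setup may differ)
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')           # splits text into word tokens
stop_words = set(stopwords.words('english'))  # common English words to drop
lemmatizer = WordNetLemmatizer()              # reduces each word to its base form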

And here's the code used to obtain my dataframes (that's the script for one of them; for the others I just change the HTML sources):

import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://www.tripadvisor.ca/Hotel_Review-g186338-d215539-Reviews-or'
urls = [ '{root}{i}-OYO_Apollo_Hotel_Bayswater-London_England.html#REVIEWS'.format(root=root_url, i=i) for i in range(5,440,5) ]
comms = []
notes = []
#datestostay = []
dates = []
for url in urls:
    results = requests.get(url)
    time.sleep(20)  # wait between requests
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('div', class_='_2wrUUKlw _3hFEdNs8')
    for container in commentary:
        comm = container.find('q', class_='IRsGHoPm').text.strip()
        comms.append(comm)
        #date_tag = container.find("div", class_="_1O8E5N17").text
        #date_text,date_value = str.split(date_tag,':')
        #datestostay.append(date_value)
        comm1 = str(container.find("div", class_="nf9vGX55").find('span'))
        rat = re.findall(r'\d+', comm1)
        rat1 = str(rat)[2]  # first digit of the bubble rating, e.g. "['50']" -> '5'
        notes.append(rat1)
        datereal = container.find("div", class_="_2fxQ4TOx").text
        date = datereal[-9:]  # keep the trailing date part of the string
        dates.append(date)
data = pd.DataFrame({
    'comms' : comms,
    #'datestostay' : datestostay,
    'notes' : notes,
    'dates' : dates
})
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
data['dates'] = data.dates.dt.strftime('%Y-%m')
'''
data['datestostay'] = pd.to_datetime(data['datestostay']).dt.date
data['datestostay'] = pd.to_datetime(data['datestostay'])
data['datestostay'] = data.datestostay.dt.strftime('%Y-%m')
'''
#print(data.head())
data.to_csv('table4.csv', sep=';', index=False)

Thanks for your help :)

asked Jan 7, 2021 at 13:44

1 Answer


Where do those dfs come from? You could put them all in a list:

dfs = [df1, df2, df3, ...] # etc

Then iterate over it:

for df in dfs:
    ... # do something with each df
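
Applied to your case, the loop body is just the same apply call; note that apply can take clean_text directly, so the lambda isn't needed:

dfs = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15]
for df in dfs:
    df['comms_clean'] = df['comms'].apply(clean_text)

Assigning the new column inside the loop modifies each original DataFrame, because the list only holds references to them.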

Most likely you should refactor the code to get rid of all those dfX variables and put the DataFrames directly into a list or a dict at the point where you create them.
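
For example, since your scraping script writes one CSV per hotel, you could load the files straight into a dict and never create df1 … df15 at all (the file names below are just an illustration):

import pandas as pd

files = {f'hotel{i}': f'table{i}.csv' for i in range(1, 16)}  # illustrative names only
dfs = {name: pd.read_csv(path, sep=';') for name, path in files.items()}

for name, df in dfs.items():
    df['comms_clean'] = df['comms'].apply(clean_text)

# e.g. dfs['hotel4'] now holds the cleaned column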

answered Jan 7, 2021 at 13:57