Here's the edited (and fixed) code after the improvements suggested by @SuperBiasedMan, also taking into consideration @Mathias Ettinger's comment.
My code was indeed broken and was only returning the same record.
After some more tests, I reverted to the for loop to build the records list, as it seems to be slightly faster; I have kept the suggestion as a comment for reference.
To be noted: @Mathias Ettinger's code is faster. :-)
Here's the edited (and fixed) code after the improvements suggested by @SuperBiasedMan, also taking into consideration @Mathias Ettinger's comment.
My code was indeed broken and was only returning the same record.
After some more tests, I reverted to the for loop to build the records list, as it seems to be slightly faster; I have kept the suggestion as a comment for reference.
To be noted: @Mathias Ettinger's code is faster. :-)
Here's the edited (and fixed) code after the improvements suggested by @SuperBiasedMan, also taking into consideration @Mathias Ettinger's comment.
My code was indeed broken and was only returning the same record.
After some more tests, I reverted to the for loop to build the records list, as it seems to be slightly faster; I have kept the suggestion as a comment for reference.
To be noted: @Mathias Ettinger's code is faster. :-)
Here's the edited (and fixed) code after the improvements suggested by @SuperBiasedMan, also taking into consideration @Mathias Ettinger's comment.
My code was indeed broken and was only returning the same record.
After some more tests, I reverted to the for loop to build the records list, as it seems to be slightly faster; I have kept the suggestion as a comment for reference.
To be noted: @Mathias Ettinger's code is faster. :-)
import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# The two sentinels are built with explicit repeat counts so the number of
# spaces cannot be lost when the file is copied or reformatted.
V_CONTINUE = ' ' * 4 + '\n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = ' ' * 2 + '\n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record found in *file*.

    A line that contains KV_SEPARATOR and does not end with SUB_REPORT starts
    a key; its value may continue over the following lines.  A record is
    closed either when a 'User_Header' key is read (its value is stored in
    the record that starts there) or when a line contains OBS_REPORT.

    Args:
        file: iterable of text lines that keep their trailing newline, e.g.
            an open text file or a list of strings.

    Yields:
        dict: key -> value, multi-line values joined with a newline.  Every
        record is a fresh dict, an empty record is never yielded, and what
        is left when the input ends is yielded as the last record.
    """
    # NOTE(review): both 'User_Header' and an OBS_REPORT line close a record
    # here, as in the original; confirm against a sample file that the format
    # really needs both boundaries.
    lines = iter(file)  # one shared iterator: the nested loops resume, never restart
    recs = {}
    for line in lines:
        # if KV_SEPARATOR is found and the line is not a Sub Report Header,
        # then we have a Key and the start of a Value
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is the left of the separator
            if line.endswith(V_CONTINUE):
                # add the continuation lines; the first line that does not
                # end with 4 spaces is the last part of the Value
                for line in lines:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            # User_Header may not use the 4 spaces to indicate multi-line, so
            # we read until V_TERMINED to be sure the Value is all captured
            if k == 'User_Header':
                for line in lines:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
                if recs:
                    yield recs
                    recs = {}  # never hand out the same dict twice
            # map(str.strip, ...) was measured faster than a generator expression
            recs[k] = '\n'.join(map(str.strip, vlist))
        if OBS_REPORT in line and recs:
            yield recs
            recs = {}
    if recs:
        # flush the record that was still being built when the input ended
        yield recs
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: what glob.glob finds for '*' + RAW_EXT inside the folder
        typed by the user (empty list when nothing matches).  An empty answer
        searches the current directory.
    """
    import os.path  # local import: the block does not rely on other edits

    raw_path = input('RAW Files Folder path: ')
    # os.path.join supplies the separator of the running platform instead of a
    # hard-coded backslash, and glob.escape keeps characters such as '[' in
    # the folder name from being read as wildcards.
    pattern = os.path.join(glob.escape(raw_path), '*' + RAW_EXT)
    return glob.glob(pattern)
# Gather the records of every RAW file, then build a single DataFrame.
rec_list = []
raw_files = get_raw_files()
for raw in raw_files:
    with open(raw, mode='r', encoding='latin-1') as infile:
        # extend() drains the generator one record at a time
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)

# Report how many files were read, or that none matched.
print('{} RAW files loaded.'.format(len(raw_files)) if raw_files else 'No file found.')
Here's the edited code after the improvements suggested by @SuperBiasedMan. After some more tests, I reverted to the for loop to build the records list, as it seems to be slightly faster; I have kept the suggestion as a comment for reference:
import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# The two sentinels are built with explicit repeat counts so the number of
# spaces cannot be lost when the file is copied or reformatted.
V_CONTINUE = ' ' * 4 + '\n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = ' ' * 2 + '\n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record found in *file*.

    A line that contains KV_SEPARATOR and does not end with SUB_REPORT starts
    a key; its value may continue over the following lines.  A line that
    contains OBS_REPORT closes the record collected so far.

    Args:
        file: iterable of text lines that keep their trailing newline, e.g.
            an open text file or a list of strings.

    Yields:
        dict: key -> value, multi-line values joined with a newline.  Every
        record is a fresh dict, an empty record is never yielded, and what
        is left when the input ends is yielded as the last record.
    """
    lines = iter(file)  # one shared iterator: the nested loops resume, never restart
    recs = {}
    for line in lines:
        # if KV_SEPARATOR is found and the line is not a Sub Report Header,
        # then we have a Key and the start of a Value
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is the left of the separator
            if line.endswith(V_CONTINUE):
                # add the continuation lines; the first line that does not
                # end with 4 spaces is the last part of the Value
                for line in lines:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            # User_Header may not use the 4 spaces to indicate multi-line, so
            # we read until V_TERMINED to be sure the Value is all captured
            if k == 'User_Header':
                for line in lines:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
            # map(str.strip, ...) was measured faster than a generator expression
            recs[k] = '\n'.join(map(str.strip, vlist))
        if OBS_REPORT in line and recs:
            yield recs
            # start a new dict: re-yielding the same object made every entry
            # collected by the caller end up as the same record
            recs = {}
    if recs:
        # flush the record that was still being built when the input ended
        yield recs
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: what glob.glob finds for '*' + RAW_EXT inside the folder
        typed by the user (empty list when nothing matches).  An empty answer
        searches the current directory.
    """
    import os.path  # local import: the block does not rely on other edits

    raw_path = input('RAW Files Folder path: ')
    # os.path.join supplies the separator of the running platform instead of a
    # hard-coded backslash, and glob.escape keeps characters such as '[' in
    # the folder name from being read as wildcards.
    pattern = os.path.join(glob.escape(raw_path), '*' + RAW_EXT)
    return glob.glob(pattern)
# Gather the records of every RAW file, then build a single DataFrame.
rec_list = []
raw_files = get_raw_files()
for raw in raw_files:
    with open(raw, mode='r', encoding='latin-1') as infile:
        # extend() drains the generator one record at a time
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)

# Report how many files were read, or that none matched.
print('{} RAW files loaded.'.format(len(raw_files)) if raw_files else 'No file found.')
Here's the edited (and fixed) code after the improvements suggested by @SuperBiasedMan, also taking into consideration @Mathias Ettinger's comment.
My code was indeed broken and was only returning the same record.
After some more tests, I reverted to the for loop to build the records list, as it seems to be slightly faster; I have kept the suggestion as a comment for reference.
To be noted: @Mathias Ettinger's code is faster. :-)
import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# The two sentinels are built with explicit repeat counts so the number of
# spaces cannot be lost when the file is copied or reformatted.
V_CONTINUE = ' ' * 4 + '\n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = ' ' * 2 + '\n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record found in *file*.

    A line that contains KV_SEPARATOR and does not end with SUB_REPORT starts
    a key; its value may continue over the following lines.  Reading a
    'User_Header' key closes the record collected so far; the User_Header
    value itself is stored in the record that starts there.

    Args:
        file: iterable of text lines that keep their trailing newline, e.g.
            an open text file or a list of strings.

    Yields:
        dict: key -> value, multi-line values joined with a newline.  Every
        record is a fresh dict, an empty record is never yielded, and what
        is left when the input ends is yielded as the last record.
    """
    # NOTE(review): User_Header is treated as the first key of a record, as in
    # the original statement order; if it is actually the last key, the
    # assignment to recs[k] belongs before the yield - confirm on a sample.
    lines = iter(file)  # one shared iterator: the nested loops resume, never restart
    recs = {}
    for line in lines:
        # if KV_SEPARATOR is found and the line is not a Sub Report Header,
        # then we have a Key and the start of a Value
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is the left of the separator
            if line.endswith(V_CONTINUE):
                # add the continuation lines; the first line that does not
                # end with 4 spaces is the last part of the Value
                for line in lines:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            # User_Header may not use the 4 spaces to indicate multi-line, so
            # we read until V_TERMINED to be sure the Value is all captured
            if k == 'User_Header':
                for line in lines:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
                if recs:
                    # skip the empty dict that exists before the first record
                    yield recs
                recs = {}
            # map(str.strip, ...) was measured faster than a generator expression
            recs[k] = '\n'.join(map(str.strip, vlist))
    if recs:
        # flush the record that was still being built when the input ended
        yield recs
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: what glob.glob finds for '*' + RAW_EXT inside the folder
        typed by the user (empty list when nothing matches).  An empty answer
        searches the current directory.
    """
    import os.path  # local import: the block does not rely on other edits

    raw_path = input('RAW Files Folder path: ')
    # os.path.join supplies the separator of the running platform instead of a
    # hard-coded backslash, and glob.escape keeps characters such as '[' in
    # the folder name from being read as wildcards.
    pattern = os.path.join(glob.escape(raw_path), '*' + RAW_EXT)
    return glob.glob(pattern)
# Gather the records of every RAW file, then build a single DataFrame.
rec_list = []
raw_files = get_raw_files()
for raw in raw_files:
    with open(raw, mode='r', encoding='latin-1') as infile:
        # extend() drains the generator one record at a time
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)

# Report how many files were read, or that none matched.
print('{} RAW files loaded.'.format(len(raw_files)) if raw_files else 'No file found.')
import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# The two sentinels are built with explicit repeat counts so the number of
# spaces cannot be lost when the file is copied or reformatted.
V_CONTINUE = ' ' * 4 + '\n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = ' ' * 2 + '\n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record found in *file*.

    A line that contains KV_SEPARATOR and does not end with SUB_REPORT starts
    a key; its value may continue over the following lines.  A line that
    contains OBS_REPORT closes the record collected so far.

    Args:
        file: iterable of text lines that keep their trailing newline, e.g.
            an open text file or a list of strings.

    Yields:
        dict: key -> value, multi-line values joined with a newline.  Every
        record is a fresh dict, an empty record is never yielded, and what
        is left when the input ends is yielded as the last record.
    """
    lines = iter(file)  # one shared iterator: the nested loops resume, never restart
    recs = {}
    for line in lines:
        # if KV_SEPARATOR is found and the line is not a Sub Report Header,
        # then we have a Key and the start of a Value
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is the left of the separator
            if line.endswith(V_CONTINUE):
                # add the continuation lines; the first line that does not
                # end with 4 spaces is the last part of the Value
                for line in lines:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            # User_Header may not use the 4 spaces to indicate multi-line, so
            # we read until V_TERMINED to be sure the Value is all captured
            if k == 'User_Header':
                for line in lines:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
            # map(str.strip, ...) was measured faster than a generator expression
            recs[k] = '\n'.join(map(str.strip, vlist))
        if OBS_REPORT in line and recs:
            yield recs
            # start a new dict: re-yielding the same object made every entry
            # collected by the caller end up as the same record
            recs = {}
    if recs:
        # flush the record that was still being built when the input ended
        yield recs
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: what glob.glob finds for '*' + RAW_EXT inside the folder
        typed by the user (empty list when nothing matches).  An empty answer
        searches the current directory.
    """
    import os.path  # local import: the block does not rely on other edits

    raw_path = input('RAW Files Folder path: ')
    # os.path.join supplies the separator of the running platform instead of a
    # hard-coded backslash, and glob.escape keeps characters such as '[' in
    # the folder name from being read as wildcards.
    pattern = os.path.join(glob.escape(raw_path), '*' + RAW_EXT)
    return glob.glob(pattern)
# Gather the records of every RAW file, then build a single DataFrame.
rec_list = []
raw_files = get_raw_files()
for raw in raw_files:
    with open(raw, mode='r', encoding='latin-1') as infile:
        # extend() drains the generator one record at a time
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)

# Report how many files were read, or that none matched.
print('{} RAW files loaded.'.format(len(raw_files)) if raw_files else 'No file found.')
import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# The two sentinels are built with explicit repeat counts so the number of
# spaces cannot be lost when the file is copied or reformatted.
V_CONTINUE = ' ' * 4 + '\n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = ' ' * 2 + '\n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record found in *file*.

    A line that contains KV_SEPARATOR and does not end with SUB_REPORT starts
    a key; its value may continue over the following lines.  A line that
    contains OBS_REPORT closes the record collected so far.

    Args:
        file: iterable of text lines that keep their trailing newline, e.g.
            an open text file or a list of strings.

    Yields:
        dict: key -> value, multi-line values joined with a newline.  Every
        record is a fresh dict, an empty record is never yielded, and what
        is left when the input ends is yielded as the last record.
    """
    lines = iter(file)  # one shared iterator: the nested loops resume, never restart
    recs = {}
    for line in lines:
        # if KV_SEPARATOR is found and the line is not a Sub Report Header,
        # then we have a Key and the start of a Value
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is the left of the separator
            if line.endswith(V_CONTINUE):
                # add the continuation lines; the first line that does not
                # end with 4 spaces is the last part of the Value
                for line in lines:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            # User_Header may not use the 4 spaces to indicate multi-line, so
            # we read until V_TERMINED to be sure the Value is all captured
            if k == 'User_Header':
                for line in lines:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
            # map(str.strip, ...) was measured faster than a generator expression
            recs[k] = '\n'.join(map(str.strip, vlist))
        if OBS_REPORT in line and recs:
            yield recs
            # start a new dict: re-yielding the same object made every entry
            # collected by the caller end up as the same record
            recs = {}
    if recs:
        # flush the record that was still being built when the input ended
        yield recs
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: what glob.glob finds for '*' + RAW_EXT inside the folder
        typed by the user (empty list when nothing matches).  An empty answer
        searches the current directory.
    """
    import os.path  # local import: the block does not rely on other edits

    raw_path = input('RAW Files Folder path: ')
    # os.path.join supplies the separator of the running platform instead of a
    # hard-coded backslash, and glob.escape keeps characters such as '[' in
    # the folder name from being read as wildcards.
    pattern = os.path.join(glob.escape(raw_path), '*' + RAW_EXT)
    return glob.glob(pattern)
# Gather the records of every RAW file, then build a single DataFrame.
rec_list = []
raw_files = get_raw_files()
for raw in raw_files:
    with open(raw, mode='r', encoding='latin-1') as infile:
        # extend() drains the generator one record at a time
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)

# Report how many files were read, or that none matched.
print('{} RAW files loaded.'.format(len(raw_files)) if raw_files else 'No file found.')
import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# The two sentinels are built with explicit repeat counts so the number of
# spaces cannot be lost when the file is copied or reformatted.
V_CONTINUE = ' ' * 4 + '\n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = ' ' * 2 + '\n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record found in *file*.

    A line that contains KV_SEPARATOR and does not end with SUB_REPORT starts
    a key; its value may continue over the following lines.  A line that
    contains OBS_REPORT closes the record collected so far.

    Args:
        file: iterable of text lines that keep their trailing newline, e.g.
            an open text file or a list of strings.

    Yields:
        dict: key -> value, multi-line values joined with a newline.  Every
        record is a fresh dict, an empty record is never yielded, and what
        is left when the input ends is yielded as the last record.
    """
    lines = iter(file)  # one shared iterator: the nested loops resume, never restart
    recs = {}
    for line in lines:
        # if KV_SEPARATOR is found and the line is not a Sub Report Header,
        # then we have a Key and the start of a Value
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is the left of the separator
            if line.endswith(V_CONTINUE):
                # add the continuation lines; the first line that does not
                # end with 4 spaces is the last part of the Value
                for line in lines:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            # User_Header may not use the 4 spaces to indicate multi-line, so
            # we read until V_TERMINED to be sure the Value is all captured
            if k == 'User_Header':
                for line in lines:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
            # map(str.strip, ...) was measured faster than a generator expression
            recs[k] = '\n'.join(map(str.strip, vlist))
        if OBS_REPORT in line and recs:
            yield recs
            # start a new dict: re-yielding the same object made every entry
            # collected by the caller end up as the same record
            recs = {}
    if recs:
        # flush the record that was still being built when the input ended
        yield recs
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: what glob.glob finds for '*' + RAW_EXT inside the folder
        typed by the user (empty list when nothing matches).  An empty answer
        searches the current directory.
    """
    import os.path  # local import: the block does not rely on other edits

    raw_path = input('RAW Files Folder path: ')
    # os.path.join supplies the separator of the running platform instead of a
    # hard-coded backslash, and glob.escape keeps characters such as '[' in
    # the folder name from being read as wildcards.
    pattern = os.path.join(glob.escape(raw_path), '*' + RAW_EXT)
    return glob.glob(pattern)
# Gather the records of every RAW file, then build a single DataFrame.
rec_list = []
raw_files = get_raw_files()
for raw in raw_files:
    with open(raw, mode='r', encoding='latin-1') as infile:
        # extend() drains the generator one record at a time
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)

# Report how many files were read, or that none matched.
print('{} RAW files loaded.'.format(len(raw_files)) if raw_files else 'No file found.')