Return to Answer

replaced http://codereview.stackexchange.com/ with https://codereview.stackexchange.com/

edited Apr 13, 2017 at 12:40

Here's the edited (and fixed) code after the improvements suggested by @SuperBiasedMan, taking into consideration @Mathias Ettinger's comment.
My code was indeed broken and was only returning the same record.
After some more tests, I reverted to the for loop to build the records list as it seems to be slightly faster, I have kept the suggestion as comment and for reference.
To be noted: @Mathias Ettinger's code is faster. :-)

Here's the edited (and fixed) code after the improvements suggested by @SuperBiasedMan, taking into consideration @Mathias Ettinger's comment.
My code was indeed broken and was only returning the same record.
After some more tests, I reverted to the for loop to build the records list as it seems to be slightly faster, I have kept the suggestion as comment and for reference.
To be noted: @Mathias Ettinger 's code is faster. :-)

fixed the code following suggestions

Source Link

edited Jan 19, 2016 at 7:38

YeO

edited Jan 19, 2016 at 7:38

YeO

Here's the edited (and fixed) code after the improvements suggested by @SuperBiasedMan, taking into consideration @Mathias Ettinger's comment.
My code was indeed broken and was only returning the same record.
After some more tests, I reverted to the for loop to build the records list as it seems to be slightly faster, I have kept the suggestion as comment and for reference.
To be noted: @Mathias Ettinger 's code is faster. :-)

import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# NOTE: the two sentinels below are restored to the widths their descriptions
# state (four and two spaces); with both collapsed to a single space they
# were identical and could not tell a continuation from a terminator.
V_CONTINUE = '    \n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = '  \n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record read from *file*.

    A record is closed either by a line containing OBS_REPORT or by the
    start of a new ``User_Header`` key, whichever comes first; the dict is
    reset after every yield so each record is a distinct object and is
    yielded only once.

    Args:
        file: an iterator of lines (such as an open text file).  The nested
            loops share its position, so a re-iterable list will not work.

    Yields:
        dict: stripped key -> Value, where the pieces of a Value (including
        continuation lines) are stripped and joined with ``'\\n'``.

    NOTE(review): whatever is accumulated after the last delimiter is never
    yielded, and the User_Header value is stored in the record that starts
    after the yield -- confirm both against the real file layout.
    """
    recs = {}
    for line in file:
        # A line holding KV_SEPARATOR that is not a Sub-Report header carries
        # a Key and the start of its Value.
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is left of the first separator
            if line.endswith(V_CONTINUE):
                # Consume continuation lines; the first one that does not
                # itself end with V_CONTINUE is the last line of the Value.
                for line in file:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            if k == 'User_Header':
                # User_Header may not use the four-space marker, so read
                # until the explicit V_TERMINED line.
                for line in file:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
                # A new User_Header closes the record gathered so far; skip
                # the yield when nothing has been gathered.
                if recs:
                    yield recs
                recs = {}
            # map(str.strip, ...) was measured faster than the equivalent
            # generator expression.
            recs[k] = '\n'.join(map(str.strip, vlist))
        # `line` may be the last line consumed by one of the inner loops.
        if OBS_REPORT in line and recs:
            yield recs
            recs = {}  # start a fresh dict so records are not shared
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: the paths glob matches for ``<folder>/*<RAW_EXT>``;
        empty when nothing matches.
    """
    import os  # local import: only this function needs it

    raw_path = input('RAW Files Folder path: ')
    # os.path.join inserts the platform separator only when the folder does
    # not already end with one, replacing the hard-coded backslash.
    return glob.glob(os.path.join(raw_path, '*' + RAW_EXT))
rec_list = []
raw_files = get_raw_files()
# Main loop: gather the records of every RAW file into a single list.
for raw in raw_files:
    with open(raw, 'r', encoding='latin-1') as infile:
        # extend() consumes the generator directly, without building an
        # intermediate list or appending one record at a time.
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)
if raw_files:
    print('{} RAW files loaded.'.format(len(raw_files)))
else:
    print('No file found.')

Here's the edited code after the improvements suggested by @SuperBiasedMan. After some more tests, I reverted to the for loop to build the records list as it seems to be slightly faster, I have kept the suggestion as comment and for reference :

import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# NOTE: the two sentinels below are restored to the widths their descriptions
# state (four and two spaces); with both collapsed to a single space they
# were identical and could not tell a continuation from a terminator.
V_CONTINUE = '    \n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = '  \n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record read from *file*.

    A record is closed by a line containing OBS_REPORT.  The dict is reset
    after each yield; yielding the same dict over and over made every
    collected record end up identical.

    Args:
        file: an iterator of lines (such as an open text file).  The nested
            loops share its position, so a re-iterable list will not work.

    Yields:
        dict: stripped key -> Value, where the pieces of a Value (including
        continuation lines) are stripped and joined with ``'\\n'``.

    NOTE(review): keys read after the last OBS_REPORT line are never
    yielded -- confirm the input always ends with a separator line.
    """
    recs = {}
    for line in file:
        # A line holding KV_SEPARATOR that is not a Sub-Report header carries
        # a Key and the start of its Value.
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is left of the first separator
            if line.endswith(V_CONTINUE):
                # Consume continuation lines; the first one that does not
                # itself end with V_CONTINUE is the last line of the Value.
                for line in file:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            if k == 'User_Header':
                # User_Header may not use the four-space marker, so read
                # until the explicit V_TERMINED line.
                for line in file:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
            # map(str.strip, ...) was measured faster than the equivalent
            # generator expression.
            recs[k] = '\n'.join(map(str.strip, vlist))
        # `line` may be the last line consumed by one of the inner loops.
        if OBS_REPORT in line and recs:
            yield recs
            recs = {}  # start a fresh dict so records are not shared
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: the paths glob matches for ``<folder>/*<RAW_EXT>``;
        empty when nothing matches.
    """
    import os  # local import: only this function needs it

    raw_path = input('RAW Files Folder path: ')
    # os.path.join inserts the platform separator only when the folder does
    # not already end with one, replacing the hard-coded backslash.
    return glob.glob(os.path.join(raw_path, '*' + RAW_EXT))
rec_list = []
raw_files = get_raw_files()
# Main loop: gather the records of every RAW file into a single list.
for raw in raw_files:
    with open(raw, 'r', encoding='latin-1') as infile:
        # extend() consumes the generator directly, without building an
        # intermediate list or appending one record at a time.
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)
if raw_files:
    print('{} RAW files loaded.'.format(len(raw_files)))
else:
    print('No file found.')

import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# NOTE: the two sentinels below are restored to the widths their descriptions
# state (four and two spaces); with both collapsed to a single space they
# were identical and could not tell a continuation from a terminator.
V_CONTINUE = '    \n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = '  \n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record read from *file*.

    Each ``User_Header`` key closes the record gathered so far: that record
    is yielded (unless it is empty), a fresh dict is started, and the
    User_Header value becomes the first entry of the fresh dict.

    Args:
        file: an iterator of lines (such as an open text file).  The nested
            loops share its position, so a re-iterable list will not work.

    Yields:
        dict: stripped key -> Value, where the pieces of a Value (including
        continuation lines) are stripped and joined with ``'\\n'``.

    NOTE(review): whatever is accumulated after the last User_Header is
    never yielded -- confirm whether the real file layout makes that safe.
    """
    recs = {}
    for line in file:
        # A line holding KV_SEPARATOR that is not a Sub-Report header carries
        # a Key and the start of its Value.
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is left of the first separator
            if line.endswith(V_CONTINUE):
                # Consume continuation lines; the first one that does not
                # itself end with V_CONTINUE is the last line of the Value.
                for line in file:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            if k == 'User_Header':
                # User_Header may not use the four-space marker, so read
                # until the explicit V_TERMINED line.
                for line in file:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
                # We yield the result after having read User_Header; an
                # empty dict (nothing gathered yet) is not worth yielding.
                if recs:
                    yield recs
                recs = {}
            # map(str.strip, ...) was measured faster than the equivalent
            # generator expression.
            recs[k] = '\n'.join(map(str.strip, vlist))
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: the paths glob matches for ``<folder>/*<RAW_EXT>``;
        empty when nothing matches.
    """
    import os  # local import: only this function needs it

    raw_path = input('RAW Files Folder path: ')
    # os.path.join inserts the platform separator only when the folder does
    # not already end with one, replacing the hard-coded backslash.
    return glob.glob(os.path.join(raw_path, '*' + RAW_EXT))
rec_list = []
raw_files = get_raw_files()
# Main loop: gather the records of every RAW file into a single list.
for raw in raw_files:
    with open(raw, 'r', encoding='latin-1') as infile:
        # extend() consumes the generator directly, without building an
        # intermediate list or appending one record at a time.
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)
if raw_files:
    print('{} RAW files loaded.'.format(len(raw_files)))
else:
    print('No file found.')

Post Made Community Wiki by 200_success

occurred Jan 18, 2016 at 18:01

typo

Source Link

edited Jan 18, 2016 at 17:20

YeO

edited Jan 18, 2016 at 17:20

YeO

import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# NOTE: the two sentinels below are restored to the widths their descriptions
# state (four and two spaces); with both collapsed to a single space they
# were identical and could not tell a continuation from a terminator.
V_CONTINUE = '    \n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = '  \n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record read from *file*.

    A record is closed by a line containing OBS_REPORT.  The dict is reset
    after each yield; yielding the same dict over and over made every
    collected record end up identical.

    Args:
        file: an iterator of lines (such as an open text file).  The nested
            loops share its position, so a re-iterable list will not work.

    Yields:
        dict: stripped key -> Value, where the pieces of a Value (including
        continuation lines) are stripped and joined with ``'\\n'``.

    NOTE(review): keys read after the last OBS_REPORT line are never
    yielded -- confirm the input always ends with a separator line.
    """
    recs = {}
    for line in file:
        # A line holding KV_SEPARATOR that is not a Sub-Report header carries
        # a Key and the start of its Value.
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is left of the first separator
            if line.endswith(V_CONTINUE):
                # Consume continuation lines; the first one that does not
                # itself end with V_CONTINUE is the last line of the Value.
                for line in file:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            if k == 'User_Header':
                # User_Header may not use the four-space marker, so read
                # until the explicit V_TERMINED line.
                for line in file:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
            # map(str.strip, ...) was measured faster than the equivalent
            # generator expression.
            recs[k] = '\n'.join(map(str.strip, vlist))
        # `line` may be the last line consumed by one of the inner loops.
        if OBS_REPORT in line and recs:
            yield recs
            recs = {}  # start a fresh dict so records are not shared
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: the paths glob matches for ``<folder>/*<RAW_EXT>``;
        empty when nothing matches.
    """
    import os  # local import: only this function needs it

    raw_path = input('RAW Files Folder path: ')
    # os.path.join inserts the platform separator only when the folder does
    # not already end with one, replacing the hard-coded backslash.
    return glob.glob(os.path.join(raw_path, '*' + RAW_EXT))
rec_list = []
raw_files = get_raw_files()
# Main loop: gather the records of every RAW file into a single list.
for raw in raw_files:
    with open(raw, 'r', encoding='latin-1') as infile:
        # extend() consumes the generator directly, without building an
        # intermediate list or appending one record at a time.
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)
if raw_files:
    print('{} RAW files loaded.'.format(len(raw_files)))
else:
    print('No file found.')

import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# NOTE: the two sentinels below are restored to the widths their descriptions
# state (four and two spaces); with both collapsed to a single space they
# were identical and could not tell a continuation from a terminator.
V_CONTINUE = '    \n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = '  \n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record read from *file*.

    A record is closed by a line containing OBS_REPORT.  The dict is reset
    after each yield; yielding the same dict over and over made every
    collected record end up identical.

    Args:
        file: an iterator of lines (such as an open text file).  The nested
            loops share its position, so a re-iterable list will not work.

    Yields:
        dict: stripped key -> Value, where the pieces of a Value (including
        continuation lines) are stripped and joined with ``'\\n'``.

    NOTE(review): keys read after the last OBS_REPORT line are never
    yielded -- confirm the input always ends with a separator line.
    """
    recs = {}
    for line in file:
        # A line holding KV_SEPARATOR that is not a Sub-Report header carries
        # a Key and the start of its Value.
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is left of the first separator
            if line.endswith(V_CONTINUE):
                # Consume continuation lines; the first one that does not
                # itself end with V_CONTINUE is the last line of the Value.
                for line in file:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            if k == 'User_Header':
                # User_Header may not use the four-space marker, so read
                # until the explicit V_TERMINED line.
                for line in file:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
            # map(str.strip, ...) was measured faster than the equivalent
            # generator expression.
            recs[k] = '\n'.join(map(str.strip, vlist))
        # `line` may be the last line consumed by one of the inner loops.
        if OBS_REPORT in line and recs:
            yield recs
            recs = {}  # start a fresh dict so records are not shared
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: the paths glob matches for ``<folder>/*<RAW_EXT>``;
        empty when nothing matches.
    """
    import os  # local import: only this function needs it

    raw_path = input('RAW Files Folder path: ')
    # os.path.join inserts the platform separator only when the folder does
    # not already end with one, replacing the hard-coded backslash.
    return glob.glob(os.path.join(raw_path, '*' + RAW_EXT))
rec_list = []
raw_files = get_raw_files()
# Main loop: gather the records of every RAW file into a single list.
for raw in raw_files:
    with open(raw, 'r', encoding='latin-1') as infile:
        # extend() consumes the generator directly, without building an
        # intermediate list or appending one record at a time.
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)
if raw_files:
    print('{} RAW files loaded.'.format(len(raw_files)))
else:
    print('No file found.')

import glob
import pandas as pd
RAW_EXT = '.raw'
OBS_REPORT = '=' * 15  # identifies a set of Observations (Observer Report)
SUB_REPORT = '[\n'  # identifies a Sub-Report within the main set
KV_SEPARATOR = ' : '  # the Key-Value Separator
# NOTE: the two sentinels below are restored to the widths their descriptions
# state (four and two spaces); with both collapsed to a single space they
# were identical and could not tell a continuation from a terminator.
V_CONTINUE = '    \n'  # if the line ends with four spaces, the Value continues on the next line
V_TERMINED = '  \n'  # if the line is 2 spaces and LF, we got to the end of the Value


def get_rec_dict(file):
    """Yield one dict of key/value pairs per record read from *file*.

    A record is closed by a line containing OBS_REPORT.  The dict is reset
    after each yield; yielding the same dict over and over made every
    collected record end up identical.

    Args:
        file: an iterator of lines (such as an open text file).  The nested
            loops share its position, so a re-iterable list will not work.

    Yields:
        dict: stripped key -> Value, where the pieces of a Value (including
        continuation lines) are stripped and joined with ``'\\n'``.

    NOTE(review): keys read after the last OBS_REPORT line are never
    yielded -- confirm the input always ends with a separator line.
    """
    recs = {}
    for line in file:
        # A line holding KV_SEPARATOR that is not a Sub-Report header carries
        # a Key and the start of its Value.
        if KV_SEPARATOR in line and not line.endswith(SUB_REPORT):
            vlist = line.split(KV_SEPARATOR)
            k = vlist.pop(0).strip()  # the Key is left of the first separator
            if line.endswith(V_CONTINUE):
                # Consume continuation lines; the first one that does not
                # itself end with V_CONTINUE is the last line of the Value.
                for line in file:
                    vlist.append(line.strip())
                    if not line.endswith(V_CONTINUE):
                        break
            if k == 'User_Header':
                # User_Header may not use the four-space marker, so read
                # until the explicit V_TERMINED line.
                for line in file:
                    if line == V_TERMINED:
                        break
                    vlist.append(line.strip())
            # map(str.strip, ...) was measured faster than the equivalent
            # generator expression.
            recs[k] = '\n'.join(map(str.strip, vlist))
        # `line` may be the last line consumed by one of the inner loops.
        if OBS_REPORT in line and recs:
            yield recs
            recs = {}  # start a fresh dict so records are not shared
def get_raw_files():
    """Prompt for a folder and return the paths of the RAW files inside it.

    Returns:
        list of str: the paths glob matches for ``<folder>/*<RAW_EXT>``;
        empty when nothing matches.
    """
    import os  # local import: only this function needs it

    raw_path = input('RAW Files Folder path: ')
    # os.path.join inserts the platform separator only when the folder does
    # not already end with one, replacing the hard-coded backslash.
    return glob.glob(os.path.join(raw_path, '*' + RAW_EXT))
rec_list = []
raw_files = get_raw_files()
# Main loop: gather the records of every RAW file into a single list.
for raw in raw_files:
    with open(raw, 'r', encoding='latin-1') as infile:
        # extend() consumes the generator directly, without building an
        # intermediate list or appending one record at a time.
        rec_list.extend(get_rec_dict(infile))
df = pd.DataFrame(rec_list)
if raw_files:
    print('{} RAW files loaded.'.format(len(raw_files)))
else:
    print('No file found.')