Parse OpenFoam files in Python

Question 1

I'm writing a Python parser for OpenFoam mesh files. When the mesh is big, I face performance issues.

Here is the format of the file describing the points:

/*--------------------------------*- C++ -*----------------------------------*\
| ========= | |
| \\ / F ield | OpenFOAM: The Open Source CFD Toolbox |
| \\ / O peration | Version: 2.2.0 |
| \\ / A nd | Web: www.OpenFOAM.org |
| \\/ M anipulation | |
\*---------------------------------------------------------------------------*/
FoamFile
{
 version 2.0;
 format ascii;
 class vectorField;
 location "constant/polyMesh";
 object points;
}
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
10
(
(2.14633 0.955 -0.627026)
(2.14633 1.005 -0.627026)
(4.0935 0.955 -0.389604)
(4.0935 1.005 -0.389604)
(0.199157 0.955 -0.864447)
(0.199157 1.005 -0.864447)
(3.075 1.005 0.562347)
(3.11114 1.005 0.558563)
(3.075 0.955 0.562347)
(3.11114 0.955 0.558563)
)
// ************************************************************************* //

The file describing the tetrahedron points:

/*--------------------------------*- C++ -*----------------------------------*\
| ========= | |
| \\ / F ield | OpenFOAM: The Open Source CFD Toolbox |
| \\ / O peration | Version: 2.2.0 |
| \\ / A nd | Web: www.OpenFOAM.org |
| \\/ M anipulation | |
\*---------------------------------------------------------------------------*/
FoamFile
{
 version 2.0;
 format ascii;
 class faceList;
 location "constant/polyMesh";
 object faces;
}
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
10
(
3(566037 390932 236201)
3(566037 146948 390932)
3(146948 236201 390932)
3(566037 236201 146948)
3(833456 434809 832768)
3(833456 832768 833463)
3(832768 434809 833463)
3(833456 833463 434809)
3(151487 504429 264888)
3(151487 264888 391870)
)
// ************************************************************************* //

Here is an example of boundary file:

/*--------------------------------*- C++ -*----------------------------------*\
| ========= | |
| \\ / F ield | OpenFOAM: The Open Source CFD Toolbox |
| \\ / O peration | Version: 2.2.0 |
| \\ / A nd | Web: www.OpenFOAM.org |
| \\/ M anipulation | |
\*---------------------------------------------------------------------------*/
FoamFile
{
 version 2.0;
 format ascii;
 class polyBoundaryMesh;
 location "constant/polyMesh";
 object boundary;
}
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
2
(
 object_surf
 {
 type wall;
 physicalType wall;
 nFaces 48738;
 startFace 9010058;
 }
 vacuum_surf
 {
 type patch;
 physicalType patch;
 nFaces 167218;
 startFace 9112924;
 }
)

And the class I wrote to parse these files:

class ParsedMesh:
 """ rep is the path to the directory containing the mesh files """
 def __init__(self,rep):
 def readFile(ficName):
 """ readFile: read a file to parse. Returne a list of the lines of the file without "\n" and ";" character """
 fic = open(os.path.join(rep,ficName),"r")
 tmp = [ line.replace(';','').replace('\n','').strip() for line in fic ] # delete \n and ;
 return [ line for line in tmp if line != '' ] # don't return the empty lines
 def parseHeader(self):
 res = {}
 headerSection = False
 ### header parsing
 for el in self.fileContent:
 if el == "FoamFile":
 headerSection = True
 continue
 if headerSection == True:
 if el == "{":
 continue
 elif el == "}":
 headerSection = False
 return res
 else:
 tmpEl = el.replace('"','').split()
 res[tmpEl[0]] = tmpEl[1]
 continue
 def parseBoundaryFile(self):
 self.fileContent = readFile("boundary")
 self.parsedMesh["boundary"]= {}
 self.parsedMesh["boundary"]["sections"]= {}
 # header
 self.parsedMesh["boundary"]["header"] = parseHeader(self)
 ## body
 boundarySection = False
 boundaryInnerSection = False
 for el in self.fileContent:
 if el.split()[0] == "(": # beginning of the values section
 boundarySection = True
 continue
 if el.split()[0] == ")": # end of the values section
 boundarySection = False
 break
 if el == "{":
 boundaryInnerSection = True
 continue
 if el == "}":
 boundaryInnerSection = False
 continue
 # read values
 if boundarySection == True:
 if boundaryInnerSection == False:
 boundName = el
 self.parsedMesh["boundary"]["sections"][boundName] = {}
 continue
 else:
 tmpEl = el.split()
 self.parsedMesh["boundary"]["sections"][boundName][tmpEl[0]] = tmpEl[1]
 continue
 def parsePointsFile(self):
 self.fileContent = readFile("points")
 self.parsedMesh["points"]= {}
 # header
 self.parsedMesh["points"]["header"] = parseHeader(self)
 ## body
 pointsSection = False
 pointNumber = 0
 self.parsedMesh["points"]["valuesList"] = []
 for el in self.fileContent:
 if el == "(": # beginning of the value section
 pointsSection = True
 continue
 if el == ")": # end of the value section
 pointsSection = False
 break
 # read the values
 if pointsSection == True:
 pointNumber += 1
 self.parsedMesh["points"]["valuesList"].append(numpy.array([float(el2) for el2 in el[1:-1].split()]))
 continue
 def parseFacesFile(self):
 self.fileContent = readFile("faces")
 self.parsedMesh["faces"]= {}
 # header
 self.parsedMesh["faces"]["header"] = parseHeader(self)
 ## body
 pointsSection = False
 pointNumber = 0
 self.parsedMesh["faces"]["valuesList"] = []
 for el in self.fileContent:
 if el == "(": # beginning of the value section
 pointsSection = True
 continue
 if el == ")": # end of the value section
 pointsSection = False
 break
 # read the values
 if pointsSection == True:
 pointNumber += 1
 self.parsedMesh["faces"]["valuesList"].append([int(el2) for el2 in el[2:-1].split()])
 continue
 self.parsedMesh = {}
 self.fileContent = []
 parseBoundaryFile(self)
 parsePointsFile(self)
 parseFacesFile(self)

Any idea allowing a performance improvement is appreciated. Any other comment is also welcome (I'm a physicist using Python, so probably making a lot of obvious mistakes).

Question 2

Can you post an example of a boundary file, too?

Question 3

I edited the post. :) Note that the points, faces and boundary values are not correct in the examples i give. But this is not important for the parsing work.

Question 4

A few suggestions:

Follow pep8. Your code is pretty good already, but your naming in particular is not correct.
Although I can see the advantage of grouping everything into a class, the individual file parsers are not directly dependent on the class. I would split those out into their own functions. This will make testing in particular easier. You can then have wrapper methods in the class that call the parser functions.
You are loading every file completely into a list. This takes a lot of memory for large files. Worse, it requires parsing the entire list twice, once for the header and once for the body. This is likely a big source of your performance issues. It would be much more memory-efficient to iterate over the lines in the file. I would recommend turning the reader into a generate that takes an iterator (which will usually be the file, but could be arbitrary lists of strings for testing), does the stripping, yields the stripped line, and skips empty lines. This has the added advantage that it will keep track of your progress, so you don't need to go back and read through the header again when you want to parse the body.
If you use a generator, you can create a for loop that runs until it reaches the part you want, then breaks, then you can have a second for loop that picks up where the first left off. This will greatly reduce the number of if tests you have to do.
You are parsing the list of numbers yourself. Don't, numpy has a fromstring function that can parse a string to a numpy array for you. It is much faster than your approach. This is also likely a major source of performance issues.
You should always use with for opening files. This safely closes them even when there is an error. In the default Python version, files are closed automatically when the function exits, but that doesn't necessarily happen in other Python implementations like Pypy. It is much safer to use with to close the files exactly when you intend to.
You can use tuple unpacking to split the lines for your dicts. So key, value = el.split().
You create a class, but then parse everything into a single dict that holds everything. That defeats the purpose of having a class. You should either parse the components into class attributes, or you should just use functions and return a single dict.
You hard-code the file names. I would make the file names arguments, with the default argument being the default name.
rep is a directory, not a repetition. The repetition may be in the directory name, but there is no reason it has to be. So this is stylistic, but I would call it dirname or something. There is no reason to mentally limit how you organize your files like that.
You make all the parsers subfunctions of __init__. This again defeats the purpose of having a class. They should be methods.
Your classes should derive from object.

So overall, here is how I would structure the code:

def clean_lines(lines):
 for line in lines:
 line = line.strip().strip(';')
 if not line:
 continue
 yield line
def consume_lines(lines, targ):
 for line in lines:
 if line == targ:
 return
def header_parser(lines):
 consume_lines(lines, '{')
 res = {}
 for line in lines:
 if line == '}':
 break
 key, value = line.split(maxsplit=1)
 res[key] = value.strip('"')
 return res
def boundary_parser(lines):
 consume_lines(lines, '(')
 sections = {}
 for line in lines:
 if line == ')':
 break
 if line != '{':
 name = line
 sections[name] = {}
 continue
 for subline in lines:
 if subline == '}':
 break
 key, value = subline.split(maxsplit=1)
 sections[name][key] = value
 return sections
def points_parser(lines):
 consume_lines(lines, '(')
 points = []
 for line in lines:
 if line == ')':
 break
 points.append(np.fromstring(line[1:-1], sep=' '))
 return points
def faces_parser(lines):
 consume_lines(lines, '(')
 faces = []
 for line in lines:
 if line == ')':
 break
 faces.append(np.fromstring(line[2:-1], dtype=np.int32, sep=' '))
 return faces
class ParsedMesh(object):
 def __init__(self, dirname):
 self.dirname = dirname
 self.parse_boundary_file()
 self.parse_points_file()
 self.parse_faces_file()
 def _parser(self, parser, fname, dirname):
 if dirname is None:
 dirname = self.dirname
 if dirname:
 fname = os.path.join(dirname, fname)
 with open(fname) as fobj:
 lines = clean_lines(fobj)
 header = header_parser(lines)
 parsed = parser(lines)
 return parsed, header
 def parse_boundary_file(self, fname='boundary', dirname=None):
 self.boundary, self.boundary_hdr = self._parser(boundary_parser,
 fname=fname,
 dirname=dirname)
 def parse_points_file(self, fname='points', dirname=None):
 self.points, self.points_hdr = self._parser(points_parser,
 fname=fname,
 dirname=dirname)
 def parse_faces_file(self, fname='faces', dirname=None):
 self.faces, self.faces_hdr = self._parser(faces_parser,
 fname=fname,
 dirname=dirname)

Question 5

line.strip(';').strip() should be line.strip().strip(';'), or perhaps strip('\n ;')

Question 6

usefull post! I just had to make a few changes in split() calls because i use python 2.7 Now i will study your comments thx!

Question 7

JanneKarila: indeed there was a little error in this line. Your proposal fixes it.

Question 8

@user59330 The reason I don't like that approach is that it won't handle all whitespace.

Question 9

Your version won't handle tabs, but split() will. There are also a couple dozen other unicode whitespace characters your code won't handle but split will.

TheBlackCat TheBlackCat 2,44412 silver badges10 bronze badges · Accepted Answer · 2015-08-18 15:38:02Z

A few suggestions:

Follow pep8. Your code is pretty good already, but your naming in particular is not correct.
Although I can see the advantage of grouping everything into a class, the individual file parsers are not directly dependent on the class. I would split those out into their own functions. This will make testing in particular easier. You can then have wrapper methods in the class that call the parser functions.
You are loading every file completely into a list. This takes a lot of memory for large files. Worse, it requires parsing the entire list twice, once for the header and once for the body. This is likely a big source of your performance issues. It would be much more memory-efficient to iterate over the lines in the file. I would recommend turning the reader into a generate that takes an iterator (which will usually be the file, but could be arbitrary lists of strings for testing), does the stripping, yields the stripped line, and skips empty lines. This has the added advantage that it will keep track of your progress, so you don't need to go back and read through the header again when you want to parse the body.
If you use a generator, you can create a for loop that runs until it reaches the part you want, then breaks, then you can have a second for loop that picks up where the first left off. This will greatly reduce the number of if tests you have to do.
You are parsing the list of numbers yourself. Don't, numpy has a fromstring function that can parse a string to a numpy array for you. It is much faster than your approach. This is also likely a major source of performance issues.
You should always use with for opening files. This safely closes them even when there is an error. In the default Python version, files are closed automatically when the function exits, but that doesn't necessarily happen in other Python implementations like Pypy. It is much safer to use with to close the files exactly when you intend to.
You can use tuple unpacking to split the lines for your dicts. So key, value = el.split().
You create a class, but then parse everything into a single dict that holds everything. That defeats the purpose of having a class. You should either parse the components into class attributes, or you should just use functions and return a single dict.
You hard-code the file names. I would make the file names arguments, with the default argument being the default name.
rep is a directory, not a repetition. The repetition may be in the directory name, but there is no reason it has to be. So this is stylistic, but I would call it dirname or something. There is no reason to mentally limit how you organize your files like that.
You make all the parsers subfunctions of __init__. This again defeats the purpose of having a class. They should be methods.
Your classes should derive from object.

So overall, here is how I would structure the code:

def clean_lines(lines):
 for line in lines:
 line = line.strip().strip(';')
 if not line:
 continue
 yield line
def consume_lines(lines, targ):
 for line in lines:
 if line == targ:
 return
def header_parser(lines):
 consume_lines(lines, '{')
 res = {}
 for line in lines:
 if line == '}':
 break
 key, value = line.split(maxsplit=1)
 res[key] = value.strip('"')
 return res
def boundary_parser(lines):
 consume_lines(lines, '(')
 sections = {}
 for line in lines:
 if line == ')':
 break
 if line != '{':
 name = line
 sections[name] = {}
 continue
 for subline in lines:
 if subline == '}':
 break
 key, value = subline.split(maxsplit=1)
 sections[name][key] = value
 return sections
def points_parser(lines):
 consume_lines(lines, '(')
 points = []
 for line in lines:
 if line == ')':
 break
 points.append(np.fromstring(line[1:-1], sep=' '))
 return points
def faces_parser(lines):
 consume_lines(lines, '(')
 faces = []
 for line in lines:
 if line == ')':
 break
 faces.append(np.fromstring(line[2:-1], dtype=np.int32, sep=' '))
 return faces
class ParsedMesh(object):
 def __init__(self, dirname):
 self.dirname = dirname
 self.parse_boundary_file()
 self.parse_points_file()
 self.parse_faces_file()
 def _parser(self, parser, fname, dirname):
 if dirname is None:
 dirname = self.dirname
 if dirname:
 fname = os.path.join(dirname, fname)
 with open(fname) as fobj:
 lines = clean_lines(fobj)
 header = header_parser(lines)
 parsed = parser(lines)
 return parsed, header
 def parse_boundary_file(self, fname='boundary', dirname=None):
 self.boundary, self.boundary_hdr = self._parser(boundary_parser,
 fname=fname,
 dirname=dirname)
 def parse_points_file(self, fname='points', dirname=None):
 self.points, self.points_hdr = self._parser(points_parser,
 fname=fname,
 dirname=dirname)
 def parse_faces_file(self, fname='faces', dirname=None):
 self.faces, self.faces_hdr = self._parser(faces_parser,
 fname=fname,
 dirname=dirname)

line.strip(';').strip() should be line.strip().strip(';'), or perhaps strip('\n ;')
usefull post! I just had to make a few changes in split() calls because i use python 2.7 Now i will study your comments thx!
JanneKarila: indeed there was a little error in this line. Your proposal fixes it.
@user59330 The reason I don't like that approach is that it won't handle all whitespace.
Your version won't handle tabs, but split() will. There are also a couple dozen other unicode whitespace characters your code won't handle but split will.

Stack Exchange Network

Parse OpenFoam files in Python

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Parse OpenFoam files in Python

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions