I'm trying to convert a shapefile containing N single-point shapes to a .gdb using the FileGDBAPI for C++ from Esri.
The structure of the shapefile is:

x | y | date1 | date2 | date3 | ...
--+---+-------+-------+-------+-----
0 | 0 |     1 |     2 |     3 | ...
1 | 0 |     2 |     1 |     0 | ...

and so on, where there can be an unknown number N of dates and Z of different points.
Right now all I'm doing is creating a new .gdb file, creating a table with these fields, and populating it with the same data as the shapefile. This all works. The problem is that it's awfully slow: converting a shapefile with 60k points takes almost 20 minutes.
Is there something obvious I'm missing, or something I could refactor to speed up the process a bit? Right now I'm opening the .shp, reading the fields from the .dbf, creating a table with those fields, populating it with the various entries from the .dbf, and closing the table and the .gdb.
Here is pretty much my code (error checking removed to save space, but it's in the real program). Where there are comments I can't show the complete implementation, but it does what the comments say, and it's correct and optimized.
std::vector<shapes> shape_list; // Contains the shape points

FileGDBAPI::Geodatabase geodatabase;
error = FileGDBAPI::CreateGeodatabase(L"./test.gdb", geodatabase);

FileGDBAPI::SpatialReference spatialReference;
spatialReference.SetSpatialReferenceText(prj_string);
error = geodatabase.CreateFeatureDataset(L"\\Spatial_Reference", spatialReference);

// Build one FieldDef per .dbf column
std::vector<FileGDBAPI::FieldDef> field_defs;
for (int i = 0; i < dbf_field_number; ++i) {
    std::wstring dbf_field_name = field_name(i); // this gives me the dbf field name at position i
    FileGDBAPI::FieldDef current_dbf_field;
    current_dbf_field.SetName(dbf_field_name);
    current_dbf_field.SetType(FileGDBAPI::fieldTypeDouble);
    current_dbf_field.SetIsNullable(true);
    current_dbf_field.SetAlias(dbf_field_name);
    current_dbf_field.SetGeometryDef(geometryDef);
    field_defs.push_back(current_dbf_field);
}

FileGDBAPI::Table table;
error = geodatabase.CreateTable(L"\\Data", field_defs, L"DEFAULTS", table);

// One row per shape point, one SetDouble per .dbf column
for (int i = 0; i < shape_number; ++i) {
    FileGDBAPI::Row row;
    table.CreateRowObject(row);

    FileGDBAPI::PointShapeBuffer row_point;
    row_point.Setup(FileGDBAPI::shapePoint);
    FileGDBAPI::Point* point;
    row_point.GetPoint(point);
    point->x = shape_list[i].getX();
    point->y = shape_list[i].getY();
    row.SetGeometry(row_point);

    for (int j = 0; j < dbf_field_number; ++j) {
        std::wstring dbf_field_name = field_name(j);
        double dbf_value = field_value(i, dbf_field_name);
        error = row.SetDouble(dbf_field_name, dbf_value);
    }
    error = table.Insert(row);
}

error = geodatabase.CompactDatabase();
error = FileGDBAPI::CloseGeodatabase(geodatabase);
1 Answer
The last FileGDBAPI code I wrote (which loaded 10k random points) had this helper function to populate a table:
std::wstring populateTable(Table& table)
{
    int j;
    fgdbError hr;
    Row row;
    Point* point;
    PointShapeBuffer pointGeom;
    short rndInt = 0;
    wstring msg = L"";
    wstring rndString = L"a";
    static wstring array[] = {
        L"a", L"bb", L"ccc", L"dddd", L"eeeee",
        L"f", L"gg", L"hhh", L"iiii", L"jjjjj",
        L"k", L"ll", L"mmm", L"nnnn", L"ooooo",
        L"p", L"qq", L"rrr", L"ssss", L"ttttt",
        L"u", L"vv", L"www", L"xxxx", L"yyyyy",
        L"z", L"bb", L"hhh", L"nnnn", L"ttttt" };

    /*
     * .. Load-only mode
     */
    if ((hr = table.SetWriteLock()) != S_OK) {
        ErrorInfo::GetErrorDescription(hr, msg);
        goto bailout;
    }

    /*
     * .. Prep
     */
    if ((hr = table.CreateRowObject(row)) != S_OK) {
        ErrorInfo::GetErrorDescription(hr, msg);
        goto bailout;
    }
    if ((hr = pointGeom.Setup(shapePoint)) != S_OK) {
        ErrorInfo::GetErrorDescription(hr, msg);
        goto bailout;
    }
    if ((hr = pointGeom.GetPoint(point)) != S_OK) {
        ErrorInfo::GetErrorDescription(hr, msg);
        goto bailout;
    }

    /*
     * .. Insert values
     */
    for (j = 0; j < INSERT_CNT; j++) {
        rndString = array[randomShort(0, STRING_CNT)];
        if ((hr = row.SetString(L"RNDSTRING", rndString)) != S_OK) {
            ErrorInfo::GetErrorDescription(hr, msg);
            goto bailout;
        }
        rndInt = randomShort(1, 1000);
        if ((hr = row.SetShort(L"RNDINT", rndInt)) != S_OK) {
            ErrorInfo::GetErrorDescription(hr, msg);
            goto bailout;
        }
        point->x = randomDouble(-180.0, 180.0);
        point->y = randomDouble(-90.0, 90.0);
        if ((hr = row.SetGeometry(pointGeom)) != S_OK) {
            ErrorInfo::GetErrorDescription(hr, msg);
            goto bailout;
        }
        if ((hr = table.Insert(row)) != S_OK) {
            ErrorInfo::GetErrorDescription(hr, msg);
            goto bailout;
        }
        written++;
    }

    /*
     * .. Clear write lock
     */
bailout:
    if ((hr = table.FreeWriteLock()) != S_OK) {
        if (msg.empty()) ErrorInfo::GetErrorDescription(hr, msg);
    }
    return msg;
} /* populateTable */
which was invoked from a driver loop in main:
/*
 * .. Insert pseudo-random rows to FCs
 */
elapsed();
for (i = 1; i <= 10; i++) {
    // Open table
    if ((hr = geodatabase.OpenTable(nameTable(i), tmpTable)) != S_OK) {
        ErrorInfo::GetErrorDescription(hr, msg);
        goto bailout;
    }
    // Insert to table
    msg = populateTable(tmpTable);
    if (!msg.empty()) goto bailout;
    // Close table
    if ((hr = geodatabase.CloseTable(tmpTable)) != S_OK) {
        ErrorInfo::GetErrorDescription(hr, msg);
        goto bailout;
    }
    ms_insert += elapsed();
}
There are two main differences between our code:

- I use table.SetWriteLock() to obtain a write lock for the bulk insert, then free it with table.FreeWriteLock().
- You use geodatabase.CompactDatabase(), which forces a rewrite of all the table data files and indexes.

I expect that you'll have a much faster API experience if you set/free the write lock, and a slightly faster experience if you skip the unnecessary Compact operation.
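Applied to the question's loop, the change is small. Here's a minimal sketch (error checking omitted, like in your original; it reuses your table, shape_list, and helper names). It also hoists the Row and PointShapeBuffer out of the loop, as my populateTable() does, so they are created once instead of once per row:

table.SetWriteLock(); // acquire the write lock once, before the bulk insert

FileGDBAPI::Row row;
table.CreateRowObject(row); // one reusable row object
FileGDBAPI::PointShapeBuffer row_point;
row_point.Setup(FileGDBAPI::shapePoint);
FileGDBAPI::Point* point;
row_point.GetPoint(point); // point stays bound to row_point's buffer

for (int i = 0; i < shape_number; ++i) {
    point->x = shape_list[i].getX();
    point->y = shape_list[i].getY();
    row.SetGeometry(row_point);
    for (int j = 0; j < dbf_field_number; ++j) {
        std::wstring dbf_field_name = field_name(j);
        row.SetDouble(dbf_field_name, field_value(i, dbf_field_name));
    }
    table.Insert(row);
}

table.FreeWriteLock(); // release the lock only after the last Insert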
In general, it's usually wise to add timing code when experiencing performance issues, so you can be sure where the delay occurs. This is the platform-independent (Linux/Windows) timing function I used (it returns milliseconds since the last invocation):
#if defined(WIN32)
#  include <sys/types.h>
#  include <sys/timeb.h>
#else
#  include <sys/time.h>
#  include <unistd.h>
#endif

double elapsed(void)
{
    double msecs;
#if defined(WIN32)
    static struct timeb start = { 0, 0 };
    struct timeb now;
    ftime(&now);
    msecs = (double)(now.time - start.time) * 1000.0 +
            ((double)now.millitm - (double)start.millitm);
#else
    static struct timeval start = { 0, 0 };
    struct timeval now;
    gettimeofday(&now, NULL);
    msecs = (double)(now.tv_sec - start.tv_sec) * 1000.0 +
            (((double)now.tv_usec - (double)start.tv_usec) * 0.001);
#endif
    start = now;
    return msecs;
} /* elapsed */
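For example, you could bracket each phase of your conversion like this (the phase breakdown is just illustrative):

elapsed();                                  // reset the clock
// ... open the .shp and read the .dbf ...
double ms_read = elapsed();
// ... create the table and insert all rows ...
double ms_insert = elapsed();
// ... CompactDatabase, if you keep it ...
double ms_compact = elapsed();
wprintf(L"read: %.0f ms, insert: %.0f ms, compact: %.0f ms\n",
        ms_read, ms_insert, ms_compact);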
I'd guess that adding the write lock will cut the insert time to under 20 seconds, and that the Compact probably adds 10 seconds to execution.
- Adding explicit locks fixed the bulk of the performance problem; everything else I'm OK with, considering the time needed to read the shapefiles and write the values to disk. The execution time went down from 20 minutes to ~1-2 minutes. – John Doe, Sep 24, 2021 at 15:51
- Your code doesn't use table.SetWriteLock() and table.FreeWriteLock(), which means a lock needs to be obtained for each row. I also question the need for a CompactDatabase without any random-access edits.
- I don't know that CompactDatabase is needed either. All I'm basing the code on is the (in my opinion severely lacking) FileGDBAPI documentation, which just states: "Compact the geodatabase. Recommended after bulk updates." In my view, adding a whole table with many fields constitutes a "bulk update".