This program reads all files in a folder and subfolders line by line, converts them in a supported character set, replaces all unsupported characters and inserts them in a SQL database. My idea was to insert all CSV files.
It works fine indeed but it takes a long time to insert all the data, especially for files that are larger than 100 MB. How could I improve in this code?
Imports System.IO
Imports System.Text
Imports System.Data
Imports System.Data.Sql
Imports System.Data.SqlClient
Imports System.Text.RegularExpressions
Imports System.Globalization
Public Class SQL_Data_Form
#Region "Felder"
Private _table As String = ""
#End Region
Structure SeekZeilen
Dim erste_Zeile As String()
Dim _Rest As String()
End Structure
Private Sub Form1_Load(sender As System.Object, e As System.EventArgs) Handles MyBase.Load
Dim conn As New DBUmgebung.cdb
conn.PruefPasswort("ASW@")
End Sub
Public Sub ReplaceChars(ByRef _Rest() As String)
For i As Integer = 0 To _Rest.Length - 1
_Rest(i) = _Rest(i).Replace("\t", Constants.vbTab).Replace("\n", Constants.vbLf).
Replace("\r", Constants.vbCr).Replace("\""", """").Replace("\\", "\")
Next
End Sub
Private Sub btnAusfuehren_Click(sender As System.Object, e As System.EventArgs) Handles btnAusführen.Click
Dim ListPathFiles As List(Of String) = ListFiles.GetFileList(TextBox1.Text, False)
For Each filePath As String In ListPathFiles
Dim fileInfo As New FileInfo(filePath)
_table = fileInfo.Name.Replace(fileInfo.Extension, String.Empty)
Try
View("connect", 15, "")
Dim conn As New DBUmgebung.cdb
View("connection successful", 20, "")
If Not conn.executeCommand("SELECT * FROM " & _table) Then
Exit Sub
End If
View("table check... ", 25, _table)
Catch ex As Exception
MessageBox.Show(ex.Message)
rtbSql.Focus()
View("connection closed", 100, "")
Exit Sub
End Try
Dim i As Integer
Dim r As SeekZeilen
Using reader As New StreamReader(filePath)
Dim TextFileTable As DataTable = Nothing
Dim line As String = reader.ReadLine()
Me.ParseString(line)
Dim memStream As New MemoryStream(Encoding.Default.GetBytes(line))
Using TextFileReader As New Microsoft.VisualBasic.FileIO.TextFieldParser(memStream)
TextFileReader.TextFieldType = FileIO.FieldType.Delimited
TextFileReader.SetDelimiters(";")
r.erste_Zeile = TextFileReader.ReadFields()
If TextFileTable Is Nothing Then
TextFileTable = New DataTable("TextFileTable")
For i = 0 To r.erste_Zeile.Length - 1
Dim Column As New DataColumn(r.erste_Zeile(i))
Column.ReadOnly = True
TextFileTable.Columns.Add(Column)
Next
End If
DataGridView1.DataSource = TextFileTable
End Using
While Not reader.EndOfStream
line = reader.ReadLine()
Me.ParseString(line)
memStream = New MemoryStream(Encoding.Default.GetBytes(line))
Using TextFileReader As New Microsoft.VisualBasic.FileIO.TextFieldParser(memStream)
TextFileReader.TextFieldType = FileIO.FieldType.Delimited
TextFileReader.SetDelimiters(";")
DataGridView1.DataSource = TextFileTable
Try
r._Rest = TextFileReader.ReadFields()
ReplaceChars(r._Rest)
If Not r._Rest Is Nothing Then
Dim oSQL As New DBUmgebung.cdb.SQL()
oSQL.init()
AddParameters(oSQL, r)
oSQL.ausfuehrenSQL(DBUmgebung.cdb.KSQLCommand.INSERT, _table, "")
View("insert: ", 35, String.Join(", ", r._Rest))
Dim dtRow As DataRow = TextFileTable.NewRow
For i = 0 To r._Rest.Length - 1
dtRow(i) = r._Rest(i).ToString()
Next
TextFileTable.Rows.Add(dtRow)
DataGridView1.Refresh()
Application.DoEvents()
End If
Catch ex As Microsoft.VisualBasic.FileIO.MalformedLineException
MsgBox("Error! " & ex.Message & _
"")
Catch sqlEx As SqlException
MessageBox.Show(sqlEx.Message)
Debug.WriteLine("SQL Error!")
rtbSql.Focus()
Exit For
Catch ex As Exception
MessageBox.Show(ex.Message)
rtbSql.Focus()
Exit For
End Try
End Using
End While
End Using
Next
End Sub
Private Sub ParseString(ByRef value As String)
If value.Contains("\"""";") Then
value = value.Replace("\"""";", "\"""""";")
value = value.Replace("\""""" & Environment.NewLine, "\""""""" & Environment.NewLine)
End If
If value.Contains("\""") Then
value = value.Replace("\""""", Constants.vbNullChar)
value = value.Replace("\""", "\""""")
value = value.Replace(Constants.vbNullChar, "\""""")
End If
End Sub
Private Sub View(Message As String, value As Integer, pathname As String)
RichTextBox1.AppendText(Message & pathname & Environment.NewLine)
ProgressBar1.Value = value
End Sub
Private Sub TextBox1_MouseDoubleClick(sender As System.Object, e As System.Windows.Forms.MouseEventArgs) Handles TextBox1.MouseDoubleClick
Dim result As DialogResult = FolderBrowserDialog1.ShowDialog()
If result = Windows.Forms.DialogResult.OK Then
TextBox1.Text = FolderBrowserDialog1.SelectedPath
End If
End Sub
Private Sub SQL_Data_Form_FormClosing(sender As System.Object, e As System.Windows.Forms.FormClosingEventArgs) Handles MyBase.FormClosing
Dim result As DialogResult = MessageBox.Show(Me, "close the program?", "", MessageBoxButtons.YesNo)
If result = Windows.Forms.DialogResult.Yes Then
e.Cancel = False
Else
e.Cancel = True
End If
End Sub
Private Sub Button2_Click(sender As System.Object, e As System.EventArgs) Handles Button2.Click
Dim conn As New DBUmgebung.cdb
View("connect", 15, _table)
Try
View("querying...", 50, "")
conn.executeCommand(rtbSql.Text)
View("Query successful", 90, "")
Catch ex As Exception
MessageBox.Show(ex.Message)
End Try
View("connection closed", 100, "")
End Sub
Private Sub DataGridView1_CellContentClick(sender As System.Object, e As System.Windows.Forms.DataGridViewCellEventArgs) Handles DataGridView1.CellContentClick
End Sub
Private Sub AddParameters(ByRef db As DBUmgebung.cdb.SQL, r As SeekZeilen)
Try
Dim dummyDate As Date
For i As Integer = 0 To r.erste_Zeile.Length - 1
Dim value As String = ZeichensatzAnpassen(r._Rest(i))
If value = "" Then
db.FeldWertChar(r.erste_Zeile(i), DBNull.Value)
ElseIf value Is Nothing OrElse IsDBNull(value) Then
db.FeldWertChar(r.erste_Zeile(i), DBNull.Value)
ElseIf Date.TryParseExact(value, "dd.MM.yyyy", CultureInfo.CurrentCulture, DateTimeStyles.None, dummyDate) Then
db.FeldWertDate(r.erste_Zeile(i), value)
ElseIf IsNumeric(value) Then
If value.Contains(",") Then
db.FeldWertDecimal(r.erste_Zeile(i), value.Replace(",", "."))
Else
db.FeldWertInteger(r.erste_Zeile(i), value)
End If
Else
db.FeldWertChar(r.erste_Zeile(i), value)
End If
Next
Catch ex As Exception
MessageBox.Show(ex.Message)
End Try
End Sub
Public Shared Function ZeichensatzAnpassen(ByVal input As String) As String
Dim OriginalCodierung As System.Text.Encoding = System.Text.Encoding.GetEncoding("utf-8")
Dim iso8859 As System.Text.Encoding = System.Text.Encoding.GetEncoding(1252)
Dim enc As System.Text.Encoding = System.Text.Encoding.Default
Dim OriginalBytes As Byte() = OriginalCodierung.GetBytes(input)
Dim result As Byte() = System.Text.Encoding.Convert(OriginalCodierung, iso8859, OriginalBytes)
Dim s As String = enc.GetString(result)
Return s
End Function
End Class
Public Class ListFiles
Public Shared Function GetFileList(ByVal Root As String, Optional ByVal SubFolders As Boolean = True) As List(Of String)
Dim FileList As New List(Of String)
SeekFiles(Root, FileList, SubFolders)
Return FileList
End Function
Private Shared Sub SeekFiles(ByVal Root As String, ByRef FileArray As List(Of String), ByVal SubFolders As Boolean)
Try
Dim Files() As String = System.IO.Directory.GetFiles(Root)
Dim Folders() As String = System.IO.Directory.GetDirectories(Root)
For i As Integer = 0 To UBound(Files)
FileArray.Add(Files(i).ToString)
Next
If SubFolders = True Then
For i As Integer = 0 To UBound(Folders)
SeekFiles(Folders(i), FileArray, SubFolders)
Next
End If
Catch Ex As Exception
End Try
End Sub
End Class
2 Answers 2
The way you insert the data looks a bit problematic:
- You seem to be doing a full table scan just to check that the table exists:
If Not conn.executeCommand("SELECT * FROM " & _table) Then Exit Sub
- After each line read you refresh the data table in the grid. If you want to display the values in a grid, just display them at the end.
- I'm not sure want your
ParseString
method does. Both VB.Net and CSV use "" for escaping a quote, not \" There's no need to create a
Microsoft.VisualBasic.FileIO.TextFieldParser
for each line. It can handle the entire file and also takes care of quotesDim parser as New Microsoft.VisualBasic.FileIO.TextFieldParser(fileName) parser.TextFieldType = FieldType.Delimited parser.Delimiters = New String(){";"} While Not parser.EndOfData Dim valuesFromCurrentLine = parser.ReadFields() 'process the current line End While
- Loading a 100 MB file into a Datatable can have a significant memory-footprint. Consider sending the data one line at a time to the server
- Why do want to convert utf-8 text to 1252?
- The code for
DBUmgebung.cdb.SQL
is not provided so I can't comment on that.
At places you seem to be reinventing the wheel. You've created a class ListFiles
for recursively retrieving the files from a folder, but .NET already provides this via Directory.GetFiles
Some design considerations:
- Don't mix UI code with the processing logic like that. .NET provides the BackgroundWorker
- Avoid passing arguments ByRef. String values are immutable so can be just as easily returned. Passing an array ByRef is useless unless you plan to allocate a different array.
- Use the .NET standard naming conventions for class/method/variable names.
-
\$\begingroup\$ yes, i need to convert utf8 because i get conversion error of codepage characters, and DBUmgebung.cdb.SQL ist just a Advantage Database driver \$\endgroup\$Sparkm4n– Sparkm4n2016年05月11日 11:28:27 +00:00Commented May 11, 2016 at 11:28
I improved the code a little and now the program inserts all lines in one transaction, therefore i use DataTable to save all the lines which are inserted in sql db, but it doesn't really read faster as i expected, is there any better reader than streamreader? for example Microsoft TextFileReaader? here's the code
Private Sub btnAusfuehren_Click(sender As System.Object, e As System.EventArgs) Handles btnAusführen.Click
Dim ListPathFiles As List(Of String) = ListFiles.GetFileList(TextBox1.Text, False)
For Each filePath As String In ListPathFiles
Dim fileInfo As New FileInfo(filePath)
_table = fileInfo.Name.Replace(fileInfo.Extension, String.Empty)
Try
View("connect", 15, "")
Dim conn As New DBUmgebung.cdb
View("connection successful", 20, "")
If Not conn.executeCommand("SELECT * FROM " & _table) Then
Exit Sub
End If
View("table check... ", 25, _table)
Catch ex As Exception
MessageBox.Show(ex.Message)
rtbSql.Focus()
View("connection closed", 100, "")
Exit Sub
End Try
Dim i As Integer
Dim r As SeekZeilen
Using reader As New StreamReader(filePath)
Dim cdb As New DBUmgebung.cdb()
Dim dtInserts As DataTable = cdb.GetDataTable("SELECT TOP 0 * FROM " & _table)
dtInserts.TableName = _table
Dim line As String = reader.ReadLine()
While Not reader.EndOfStream
line = reader.ReadLine()
Me.ParseString(line)
Dim memStream As New MemoryStream(Encoding.Default.GetBytes(line))
Using TextFileReader As New Microsoft.VisualBasic.FileIO.TextFieldParser(memStream)
TextFileReader.TextFieldType = FileIO.FieldType.Delimited
TextFileReader.SetDelimiters(";")
DataGridView1.DataSource = dtInserts
Try
r._Rest = TextFileReader.ReadFields()
ReplaceChars(r._Rest)
If Not r._Rest Is Nothing Then
View("insert: ", 35, String.Join(", ", r._Rest))
Dim dtRow As DataRow = dtInserts.NewRow
For i = 0 To r._Rest.Length - 1
dtRow(i) = r._Rest(i).ToString()
Next
dtInserts.Rows.Add(dtRow)
DataGridView1.Refresh()
Application.DoEvents()
End If
Catch ex As Microsoft.VisualBasic.FileIO.MalformedLineException
MsgBox("Error! " & ex.Message & _
"")
Catch sqlEx As SqlException
MessageBox.Show(sqlEx.Message)
Debug.WriteLine("SQL Error!")
rtbSql.Focus()
Exit For
Catch ex As Exception
MessageBox.Show(ex.Message)
rtbSql.Focus()
Exit For
End Try
End Using
End While
Dim sql As New DBUmgebung.cdb.SQL()
sql.ausfuehrenSQLDT(DBUmgebung.cdb.KSQLCommand.INSERT, dtInserts.TableName, String.Empty, dtInserts)
End Using
Next
End Sub
Connection conCSV; sConnectCSV = "jdbc:odbc:Driver={Microsoft Text Driver (*.txt; *.csv)};Dbq=d:\\mmi\\;Extensions=csv;Persist Security Info=False;"; conCSV = DriverManager.getConnection(sConnectCSV,"",""); Statement stmt = conCSV.createStatement();
\$\endgroup\$