Tuesday, September 9, 2008

Parse CSV or other separated strings

I had to work with Comma Separated Values (CSV) files the last few days and updated some code to make it easier to use with LINQ. Quick review, a CSV file is a file that has all of the values separated by a comma. Values that contain a comma are typically qualified with double quotes. For example:

Abraham, Lincoln, 03/4/1861, 04/15/1865, "Republican, National Union"

Basically, I created a class called SeparatedString that performs all the functionality of parsing the data so that I can retrieve it later. When the Text property is set it figures out the location of all relevant commas and loads them into a List. The GetValue method then uses the index to grab the occurrence of the comma that precedes the value to be retrieved.

Public Class SeparatedString
    ''' <summary>
    ''' Contains the position of all separator characters in the string
    ''' </summary>
    ''' <remarks></remarks>
    Private _Separators As New List(Of Integer)

    ''' <summary>
    ''' The CSV text of the string.
    ''' </summary>
    ''' <remarks></remarks>
    Private _Text As String
    Public Property Text() As String
        Get
            Return _Text
        End Get
        Set(ByVal value As String)
            _Text = value
            LoadSeparatorPositions()
        End Set
    End Property

    Private _Qualifier As Char = Chr(34)
    ''' <summary>
    ''' The character used to qualify text strings.
    ''' </summary>
    ''' <value>The character used to qualify text strings.</value>
    ''' <returns>Char</returns>
    ''' <remarks>
    ''' <para>
    ''' Default value: double quotes - chr(34)
    ''' </para>
    ''' Depending on the tool used to create the string, text
    ''' strings may be qualified by a character, typically double quotes.
    ''' <example>
    ''' "John", "Quincy", "Adams", 01/01/2003, 54
    ''' </example>
    ''' </remarks>
    Public Property Qualifier() As Char
        Get
            Return _Qualifier
        End Get
        Set(ByVal value As Char)
            _Qualifier = value
        End Set
    End Property

    Private _Separator As Char = ","
    ''' <summary>
    ''' The character used to separate values.
    ''' </summary>
    ''' <value>The character used to separate values.</value>
    ''' <returns>Char</returns>
    ''' <remarks>
    ''' <para>
    ''' Default value: comma
    ''' </para>
    ''' <example>
    ''' "John", "Quincy", "Adams", 01/01/2003, 54
    ''' </example>
    ''' </remarks>
    Public Property Separator() As Char
        Get
            Return _Separator
        End Get
        Set(ByVal value As Char)
            _Separator = value
        End Set
    End Property

    ''' <summary>
    ''' Loads the list of separator positions.
    ''' </summary>
    ''' <remarks>
    ''' The _Separators list is cleared and loaded with the
    ''' position of all separators. If the string is empty
    ''' or contains no unqualified separators, the list is
    ''' empty.
    ''' <para>
    ''' NOTE: Separators found within qualifiers are not
    ''' stored.
    ''' </para>
    ''' </remarks>
    Private Sub LoadSeparatorPositions()
        _Separators.Clear()
        If _Text.Trim.Length < 1 Then Return

        Dim IsInQualifier = False
        Dim CurChar As Char
        'Dim x = 0
        For x = 0 To _Text.Length - 1
            CurChar = _Text.Substring(x, 1)
            If CurChar = _Separator And Not IsInQualifier Then _Separators.Add(x)
            If CurChar = _Qualifier Then IsInQualifier = Not IsInQualifier
        Next
    End Sub

    ''' <summary>
    ''' Gets the value at the specified separator position.
    ''' </summary>
    ''' <param name="occur">The occurance of the separator that begins the value.</param>
    ''' <returns>String</returns>
    ''' <remarks>
    ''' NOTE: The list of values is zero based unless the string begins with a separator.
    ''' <example>
    ''' <![CDATA[
    ''' Dim sStr As New SeparatedString(Chr(34) & "John" & Chr(34) & ", " & Chr(34) & "Quincy" & Chr(34) & ", " & Chr(34) & "Adams" & Chr(34) & ", 07/11/1767, 80")
    ''' Console.WriteLine("First Name: " & sStr.GetValue(0))
    ''' Console.WriteLine("Middle Name: " & sStr.GetValue(1))
    ''' Console.WriteLine("Last Name: " & sStr.GetValue(2))
    ''' Console.WriteLine("Date Of Birth: " & sStr.GetValue(3))
    ''' Console.WriteLine("Age: " & sStr.GetValue(4))
    ''' ]]> 
    ''' ' You'll need to remove the CDATA start and end.
    ''' '
    ''' 'This example produces the following results:
    ''' 'First Name: John
    ''' 'Middle Name: Quincy
    ''' 'Last Name: Adams
    ''' 'Date Of Birth: 07/11/1767
    ''' 'Age: 80
    ''' </example>
    ''' </remarks>
    Public Function GetValue(ByVal occur As Integer) As String
        If occur > _Separators.Count Then Return String.Empty

        Dim start = 0
        If occur > 0 Then start = _Separators(occur - 1) + 1

        Dim value As String = ""
        If occur < _Separators.Count Then
            value = _Text.Substring(start, _Separators(occur) - start)
        Else
            value = _Text.Substring(start)
        End If
        value = value.Trim
        value = value.Replace(_Qualifier, "")
        If value.Length < 1 Then Return String.Empty
        Return value
    End Function

    ''' <summary>
    ''' Creates a new instance of the SeparatedString class.
    ''' </summary>
    ''' <remarks></remarks>
    Public Sub New()
        Separator = ","
        Qualifier = Chr(34)
    End Sub

    ''' <summary>
    ''' Creates a new instance of the SeparatedString class with the specified value.
    ''' </summary>
    ''' <remarks></remarks>
    Public Sub New(ByVal value As String)
        Text = value
    End Sub

    ''' <summary>
    ''' Creates a new instance of the SeparatedString class with the specified value, separator and qualifier.
    ''' </summary>
    ''' <remarks></remarks>
    Public Sub New(ByVal value As String, ByVal sep As Char, ByVal qual As Char)
        Separator = sep
        Qualifier = qual
        Text = value
    End Sub
End Class

You can simply use the class out of the box or inherit from SeparatedString and create a class that has meaningful properties that can be used with LINQ.

Public Class President
    Inherits SDG.FileHandler.SeparatedString

    Public Const FirstNameKey As Int16 = 0
    Public Const LastNameKey As Int16 = 1
    Public Const TermStartKey As Int16 = 2
    Public Const TermEndKey As Int16 = 3
    Public Const PartyKey As Int16 = 4

    Public ReadOnly Property FirstName() As String
        Get
            Return GetValue(FirstNameKey)
        End Get
    End Property

    Public ReadOnly Property LastName() As String
        Get
            Return GetValue(LastNameKey)
        End Get
    End Property

    Public ReadOnly Property Party() As String
        Get
            Return GetValue(PartyKey)
        End Get
    End Property

    Public ReadOnly Property TermEnd() As Date
        Get
            If IsDate(GetValue(TermEndKey)) Then Return GetValue(TermEndKey)
            Return Date.MinValue
        End Get
    End Property

    Public ReadOnly Property TermStart() As Date
        Get
            If IsDate(GetValue(TermStartKey)) Then Return GetValue(TermStartKey)
            Return Date.MinValue
        End Get
    End Property

    Public Sub New()
        MyBase.New()
    End Sub

    Public Sub New(ByVal value As String)
        MyBase.New(value)
    End Sub
End Class

Putting it all together, I simply read the file and create a List. Since LINQ can be used on generic Lists, we're all set. The following reads a CSV file of US Presidents and returns those inaugurated in January.

Dim presidents As New List(Of President)

Using dataFile = New StreamReader(TextBox1.Text)
    Dim line As String
    Do
        line = dataFile.ReadLine
        If line Is Nothing Then Exit Do
        If line.Trim.Length < 2 Then Exit Do
        presidents.Add(New President(line))
    Loop Until line Is Nothing
End Using

Dim query = From p In presidents Where p.TermStart.Month = 1 Select p

GridView1.DataSource = query.ToList
        

You can change the separator and/or qualifier. I made a conscious decision to not provide a  mechanism to override the check for a qualifier. Simply use a character that will not appear in your file to get around this check.