Parse a String in Excel Vba
Solution 1
I've started to write a parser in VBA for the string structure specified by you, and it's not complete, but I'll post it anyways. Maybe you can pick up some ideas from it.
Sub ParseString()
Dim str As String
str = "|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|"
' Declare an object dictionary
' Make a reference to Microsoft Scripting Runtime in order for this to work
Dim dict As New Dictionary
' If the bars are present in the first and last character of the string, replace them
str = Replace(str, "|", "", 1, 1)
If (Mid(str, Len(str), 1) = "|") Then
str = Mid(str, 1, Len(str) - 1)
End If
' Split the string by bars
Dim substring_array() As String
substring_array = Split(str, "|")
' Declare a regex object
' Check the reference to Microsoft VBScript Regular Expressions 5.5 in order for this to work
Dim regex As New RegExp
With regex
.Global = True
.IgnoreCase = True
.MultiLine = True
End With
' Object to store the regex matches
Dim matches As MatchCollection
Dim param_name_matches As MatchCollection
Dim parameter_value_matches As MatchCollection
' Define some regex patterns
pattern_for_structure_name = "^[^=;]+;"
pattern_for_parameters = "[^=;]+=[^=;]+;"
pattern_for_parameter_name = "[^=;]="
pattern_for_parameter_val = "[^=;];"
' Loop through the elements of the array
Dim i As Integer
For i = 0 To UBound(substring_array) - LBound(substring_array)
' Get the array element in a string
str1 = substring_array(i)
' Check if it contains a structure name
regex.Pattern = pattern_for_structure_name
Set matches = regex.Execute(str1)
If matches.Count = 0 Then
' This substring does not contain a structure name
' Check if it contains parameters
regex.Pattern = pattern_for_parameter
Set matches = regex.Execute(matches(0).Value)
If matches.Count = 0 Then
' There are no parameters as well as no structure name
' This means the string had || - invalid string
MsgBox ("Invalid string")
Else
' The string contains parameter names
' Add each parameter name to the dictionary
Dim my_match As match
For Each my_match In matches
' Get the name of the parameter
regex.Pattern = pattern_for_parameter_name
Set parameter_name_matches = regex.Execute(my_match.Value)
' Check if the above returned any matches
If parameter_name_matches.Count = 1 Then
' Remove = sign from the parameter name
parameter_name = Replace(parameter_name_matches(0).Value, "=", "")
' Get the value of the parameter
regex.Pattern = pattern_for_parameter_value
Set parameter_value_matches = regex.Execute(my_match.Value)
' Check if the above returned any matches
If parameter_value_matches.Count = 1 Then
' Get the value
parameter_value = Replace(parameter_value_matches(0).Value, ";", "")
' Add the parameter name and value as a key pair to the Dictionary object
dict.Item(parameter_name) = parameter_value
Else
' Number of matches is either 0 or greater than 1 - in both cases the string is invalid
MsgBox ("Invalid string")
End If
Else
' Parameter name did not match - invalid string
MsgBox ("Invalid string")
End If
Next
End If
ElseIf matches.Count = 1 Then
' This substring contains a single structure name
' Check if it has parameter names
Else
' This substring contains more than one structure name - the original string is invalid
MsgBox ("Invalid string")
End If
Next i
End Sub
Solution 2
This looks like a simple nested delimited string. A couple of Split()
functions will do the trick:
Option Explicit
Function parseString(str As String) As Collection
Dim a1() As String, i1 As Long, c1 As Collection
Dim a2() As String, i2 As Long, c2 As Collection
Dim a3() As String
a1 = Split(str, "|")
Set c1 = New Collection
For i1 = LBound(a1) To UBound(a1)
If a1(i1) <> "" Then
Set c2 = New Collection
a2 = Split(a1(i1), ";")
For i2 = LBound(a2) To UBound(a2)
If a2(i2) <> "" Then
a3 = Split(a2(i2), "=")
If UBound(a3) > 0 Then
c2.Add a3(1), a3(0)
ElseIf UBound(a3) = 0 Then
c2.Add a3(0)
End If
End If
Next i2
c1.Add c2
End If
Next i1
Set parseString = c1
End Function
Sub testParseString()
Dim c As Collection
Set c = parseString("|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|")
Debug.Assert c(1)(1) = "KC"
Debug.Assert c(2)("PE") = "5"
Debug.Assert c(3)(1) = "CD"
Debug.Assert c(4)("HP") = "abc"
Debug.Assert c(4)(3) = "abc"
End Sub
Note that you can address values by both, index and key (if key existed in the input). If key was not provided you can only access the value by its index. You can also iterate collection recursively to get all the values in a tree structure.
Food for thought: since your structures may have repeated names (in your case "CD" structure happens twice) Collections / Dictionaries would find it problematic to store this elegantly (due to key collisions). Another good way to approach this is to create an XML structure with DOMDocument and use XPath to access its elements. See Program with DOM in Visual Basic
UPDATE: I've added XML example below as well. Have a look.
Solution 3
Here is another take on your string parsing issue using DOMDocument
XML parser. You need to include Microsoft XML, v.6.0 in your VBA references.
Function parseStringToDom(str As String) As DOMDocument60
Dim a1() As String, i1 As Long
Dim a2() As String, i2 As Long
Dim a3() As String
Dim dom As DOMDocument60
Dim rt As IXMLDOMNode
Dim nd As IXMLDOMNode
Set dom = New DOMDocument60
dom.async = False
dom.validateOnParse = False
dom.resolveExternals = False
dom.preserveWhiteSpace = True
Set rt = dom.createElement("root")
dom.appendChild rt
a1 = Split(str, "|")
For i1 = LBound(a1) To UBound(a1)
If a1(i1) <> "" Then
a2 = Split(a1(i1), ";")
Set nd = dom.createElement(a2(0))
For i2 = LBound(a2) To UBound(a2)
If a2(i2) <> "" Then
a3 = Split(a2(i2), "=")
If UBound(a3) > 0 Then
nd.appendChild dom.createElement(a3(0))
nd.LastChild.Text = a3(1)
End If
End If
Next i2
rt.appendChild nd
End If
Next i1
Set parseStringToDom = dom
End Function
Sub testParseStringToDom()
Dim dom As DOMDocument60
Set dom = parseStringToDom("|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|")
Debug.Assert Not dom.SelectSingleNode("/root/KC") Is Nothing
Debug.Assert dom.SelectSingleNode("/root/AD/PE").Text = "5"
Debug.Assert dom.SelectSingleNode("/root/CD[1]/HP").Text = "test"
Debug.Assert dom.SelectSingleNode("/root/CD[2]/HP").Text = "abc"
Debug.Print dom.XML
End Sub
As you can see this converts your text into an XML DOM document preserving all the structures and allowing for duplicates in naming. You can then use XPath to access any node or value. This can also be extended to have more nesting levels and further structures.
This is the XML document it creates behind the scenes:
<root>
<KC/>
<AD>
<PE>5</PE>
<PF>3</PF>
</AD>
<CD>
<PE>5</PE>
<HP>test</HP>
</CD>
<CD>
<PE>3</PE>
<HP>abc</HP>
</CD>
</root>
MeSS83
Updated on June 04, 2022Comments
-
MeSS83 almost 2 years
I have a macro that send an XMLHTTP request to a server and it gets as response a plain text string, not a JSON format string or other standard formats (at least for what I know).
I would like to parse the output string in order to access the data in an structured approach in the same fashion as the parseJson subroutine in this link
My problem is I am not good with regular expressions and I am not able to modify the routine for my needs.
The string that I need to parse has the following structure:
- The string is a single line
- Each single parameter is defined by its parameter name the equal simbol, its value and ending with; "NID=3;" or "SID=Test;"
- Parameter can be collected in "structures" starts and end with the symbol | and they are identified with their name followed by ; such as |STEST;NID=3;SID=Test;|
- A structure can contain also other structures
An example of a output string is the following
|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|
In this case there is a macro structure KC which contains a structure AD. The structure AD is composed by the parameters PE, PF and 2 structures CD. And finaly the structures CD have the parameters PE and HP
So I would like to parse the string to obtain an Object/Dictionary that reflects this structure, can you help me?
Adds after the first answers
Hi all, thank you for your help, but I think I should make more clear the output that I would like to get. For the example string that I have, I would like to have an object with the following structure:
<KC> <AD> <PE>5</PE> <PF>3</PF> <CD> <PE>5</PE> <HP>test</HP> </CD> <CD> <PE>3</PE> <HP>abc</HP> </CD> </AD> </KC>
So I started to wrote a possible working code base on some hint from @Nvj answer and the answer in this link
Option Explicit Option Base 1 Sub Test() Dim strContent As String Dim strState As String Dim varOutput As Variant strContent = "|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|" Call ParseString(strContent, varOutput, strState) End Sub Sub ParseString(ByVal strContent As String, varOutput As Variant, strState As String) ' strContent - source string ' varOutput - created object or array to be returned as result ' strState - Object|Array|Error depending on processing to be returned as state Dim objTokens As Object Dim lngTokenId As Long Dim objRegEx As Object Dim bMatched As Boolean Set objTokens = CreateObject("Scripting.Dictionary") lngTokenId = 0 Set objRegEx = CreateObject("VBScript.RegExp") With objRegEx .Global = True .MultiLine = True .IgnoreCase = True .Pattern = "\|[A-Z]{2};" 'Pattern for the name of structures Tokenize objTokens, objRegEx, strContent, lngTokenId, bMatched, "str" .Pattern = "[A-Z]{2}=[^\|=;]+;" 'Pattern for parameters name and values Tokenize objTokens, objRegEx, strContent, lngTokenId, bMatched, "par" End With End Sub Sub Tokenize(objTokens, objRegEx, strContent, lngTokenId, bMatched, strType) Dim strKey As String Dim strKeyPar As String Dim strKeyVal As String Dim strWork As String Dim strPar As String Dim strVal As String Dim strLevel As String Dim strRes As String Dim lngCopyIndex As Long Dim objMatch As Object strRes = "" lngCopyIndex = 1 With objRegEx For Each objMatch In .Execute(strContent) If strType = "str" Then bMatched = True With objMatch strWork = Replace(.Value, "|", "") strWork = Replace(strWork, ";", "") strLevel = get_Level(strWork) strKey = "<" & lngTokenId & strLevel & strType & ">" objTokens(strKey) = strWork strRes = strRes & Mid(strContent, lngCopyIndex, .FirstIndex - lngCopyIndex + 1) & strKey lngCopyIndex = .FirstIndex + .Length + 1 End With lngTokenId = lngTokenId + 1 ElseIf strType = "par" Then strKeyPar = "<" & lngTokenId & "par>" strKeyVal = "<" & lngTokenId & "val>" strKey = strKeyPar & strKeyVal bMatched = True With objMatch strWork = Replace(.Value, ";", "") strPar = Split(strWork, "=")(0) strVal = Split(strWork, "=")(1) objTokens(strKeyPar) = strPar objTokens(strKeyVal) = strVal strRes = strRes & Mid(strContent, lngCopyIndex, .FirstIndex - lngCopyIndex + 1) & strKey lngCopyIndex = .FirstIndex + .Length + 1 End With lngTokenId = lngTokenId + 2 End If Next strContent = strRes & Mid(strContent, lngCopyIndex, Len(strContent) - lngCopyIndex + 1) End With End Sub Function get_Level(strInput As String) As String Select Case strInput Case "KC" get_Level = "L1" Case "AD" get_Level = "L2" Case "CD" get_Level = "L3" Case Else MsgBox ("Error") End End Select End Function
This function creates a dictionary with an item for each structure name, parameter name and parameter value as shown in the figure Thanks to the function
get_Level
the items associated to structures have a level that should help to preserve the original hierarchy of the data.So what I am missing is a function to create an object that has the original structure of the input string. This is what the
Retrieve
function do in this answer link, but I do not know how to adapt it to my case