Fast replacement of XML node values

15,557

Solution 1

Thanks to everyone who contributed! I ran a performance test on a set of 2000 XML documents using my DOM implementation, Sergej's StAX implementation and Ben's XSLT implementation as well as another implementation of my own, using regular expressions. The results came out as follows:

  • DOM: 23,93s
  • StAX: 20,37s
  • XSLT: 83,52s
  • Regex: 7,83s

And here is the winner:

/**
 * Replaces the personal data inside every {@code <Person>...</Person>} section of an XML
 * string with fake data, using regular expressions only (the document is never parsed).
 *
 * <p>For each section the text of {@code <personID>} is read and passed to
 * {@code getFakePerson}; the first {@code firstName}, {@code lastName}, {@code address} and
 * {@code personID} elements found in the section are then rewritten with the values of the
 * returned person. Elements that are absent are simply not touched. Everything outside the
 * {@code <Person>} sections is copied unchanged.
 *
 * <p>Fake values are inserted verbatim; this method does not XML-escape them.
 *
 * @param xmlInstance the XML document as a string
 * @return the document with the person data replaced
 * @throws IllegalStateException if a {@code <Person>} section contains no {@code <personID>}
 */
public String replaceWithFalseData(String xmlInstance) {
    // Both patterns are loop-invariant, so compile them once per call.
    // DOTALL lets a <Person> section span several lines; the ID pattern deliberately has no
    // DOTALL, so ".*" cannot run past the end of the line the element is on.
    Pattern personPattern = Pattern.compile("<Person>.*?</Person>", Pattern.DOTALL);
    Pattern idPattern = Pattern.compile("<personID>(.*)</personID>");

    Matcher personMatcher = personPattern.matcher(xmlInstance);
    StringBuffer xmlBuffer = new StringBuffer();

    while (personMatcher.find()) {
        String personXml = personMatcher.group();

        Matcher idMatcher = idPattern.matcher(personXml);
        if (!idMatcher.find()) {
            // The message carries only the offset, never the (personal) content itself.
            throw new IllegalStateException(
                    "No <personID> found in the <Person> section starting at offset "
                            + personMatcher.start());
        }
        String id = idMatcher.group(1);
        Person fakePerson = getFakePerson(id);

        // Replacement strings are quoted: '$' and '\' are special in replaceFirst and
        // appendReplacement, and either may occur in the fake data or in the original XML.
        personXml = personXml.replaceFirst("<firstName>.*</firstName>",
                Matcher.quoteReplacement("<firstName>" + fakePerson.getFirstName() + "</firstName>"));

        personXml = personXml.replaceFirst("<lastName>.*</lastName>",
                Matcher.quoteReplacement("<lastName>" + fakePerson.getLastName() + "</lastName>"));

        personXml = personXml.replaceFirst("<address>.*</address>",
                Matcher.quoteReplacement("<address>" + fakePerson.getAddress() + "</address>"));

        personXml = personXml.replaceFirst("<personID>.*</personID>",
                Matcher.quoteReplacement("<personID>" + fakePerson.getPersonID() + "</personID>"));

        personMatcher.appendReplacement(xmlBuffer, Matcher.quoteReplacement(personXml));
    }

    personMatcher.appendTail(xmlBuffer);
    return xmlBuffer.toString();
}

Solution 2

You are using a DOM-based API. Faster replacement can be achieved with the Streaming API for XML (StAX), which in many cases outperforms a DOM-based API: StAX versus DOM

The DOM API uses more memory than StAX, which can degrade performance, but it is easier to use than the StAX API.

Here is a working solution for your example. It was tested on a 150 MB XML file, which was processed in 10 seconds:

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;


/**
 * Copies an XML file with StAX and, inside every {@code <Person>} element, replaces the text
 * of the {@code personID}, {@code firstName}, {@code lastName} and {@code address} elements
 * with fake data. All other events are written to the output unchanged.
 *
 * <p>Element names are compared against {@code QName.toString()}, so only elements without a
 * namespace URI match.
 */
public class ReplaceXmlWithFakeUser
{
  /** Input file used when no path is given on the command line. */
  private static final String DEFAULT_INPUT_PATH = "c:\\temp\\persons.xml";

  /** Output file used when no path is given on the command line. */
  private static final String DEFAULT_OUTPUT_PATH = "c:\\temp\\fakePersons.xml";

  /**
   * Reads the input XML file and writes the copy with fake person data.
   *
   * @param args optional: {@code args[0]} is the input path and {@code args[1]} the output
   *             path; whichever is missing falls back to the built-in default path
   * @throws XMLStreamException if the input is not well-formed, or a {@code Person} element
   *                            needs fake data but contains no {@code personID}
   * @throws IOException        if a file cannot be opened or closed
   */
  public static void main(String[] args) throws XMLStreamException, IOException
  {
    String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
    String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

    // The StAX reader/writer are not relied upon to close the underlying streams, so the
    // file streams are closed explicitly, also when processing fails.
    BufferedInputStream in = new BufferedInputStream(new FileInputStream(inputPath));
    try
    {
      BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(outputPath));
      try
      {
        XMLEventReader eventReader = XMLInputFactory.newInstance().createXMLEventReader(in);
        XMLEventWriter writer = XMLOutputFactory.newInstance().createXMLEventWriter(out);

        copyWithFakePersons(eventReader, writer);

        writer.flush();
        writer.close();
        eventReader.close();
      }
      finally
      {
        out.close();
      }
    }
    finally
    {
      in.close();
    }
  }

  /** Copies all events to the writer, rewriting the content of each Person element. */
  private static void copyWithFakePersons(XMLEventReader eventReader, XMLEventWriter writer)
      throws XMLStreamException
  {
    XMLEventFactory eventFactory = XMLEventFactory.newInstance();

    while (eventReader.hasNext())
    {
      XMLEvent event = eventReader.nextEvent();
      writer.add(event);

      if (isStartOf(event, "Person"))
      {
        // personID may come after the elements that must be overwritten, so the whole
        // Person content is buffered first and only written once the ID is known.
        writeFakePerson(readPersonEvents(eventReader), writer, eventFactory);
      }
    }
  }

  /**
   * Reads the events following a Person start tag up to and including its matching end tag.
   * Nesting depth is counted, so the end tag is found even if child elements are nested.
   */
  private static List<XMLEvent> readPersonEvents(XMLEventReader eventReader)
      throws XMLStreamException
  {
    List<XMLEvent> personEvents = new ArrayList<XMLEvent>();
    int depth = 1; // the Person start tag has already been consumed

    while (depth > 0)
    {
      XMLEvent event = eventReader.nextEvent();
      if (event.isStartElement())
      {
        depth++;
      }
      else if (event.isEndElement())
      {
        depth--;
      }
      personEvents.add(event);
    }
    return personEvents;
  }

  /**
   * Returns the text of the personID element in the buffered events (the last one if there
   * are several), an empty string if the element is empty, or null if there is none.
   */
  private static String findPersonId(List<XMLEvent> personEvents)
  {
    String personId = null;
    StringBuilder text = null; // non-null only while inside a personID element

    for (XMLEvent event : personEvents)
    {
      if (isStartOf(event, "personID"))
      {
        text = new StringBuilder();
      }
      else if (text != null && event.isCharacters())
      {
        // The parser may deliver one text node as several Characters events.
        text.append(event.asCharacters().getData());
      }
      else if (text != null && isEndOf(event, "personID"))
      {
        personId = text.toString();
        text = null;
      }
    }
    return personId;
  }

  /** Writes the buffered Person events, substituting fake text in the replaced elements. */
  private static void writeFakePerson(List<XMLEvent> personEvents, XMLEventWriter writer,
      XMLEventFactory eventFactory) throws XMLStreamException
  {
    String personId = findPersonId(personEvents);
    Person fakePerson = (personId == null) ? null : getFakePerson(personId);

    Iterator<XMLEvent> events = personEvents.iterator();
    while (events.hasNext())
    {
      XMLEvent event = events.next();
      writer.add(event);

      if (!event.isStartElement())
      {
        continue;
      }
      String elementName = event.asStartElement().getName().toString();
      if (!isReplacedElement(elementName))
      {
        continue;
      }
      if (fakePerson == null)
      {
        throw new XMLStreamException(
            "Person element contains <" + elementName + "> but no personID; cannot create fake data");
      }

      writer.add(eventFactory.createCharacters(fakeValueFor(elementName, fakePerson)));

      // Drop the real content, however many events it consists of (none for an empty
      // element), and write the matching end tag so the output stays well-formed.
      int depth = 1;
      while (depth > 0)
      {
        XMLEvent original = events.next();
        if (original.isStartElement())
        {
          depth++;
        }
        else if (original.isEndElement())
        {
          depth--;
        }
        if (depth == 0)
        {
          writer.add(original);
        }
      }
    }
  }

  /** Tells whether the text of the element with this name is replaced with fake data. */
  private static boolean isReplacedElement(String elementName)
  {
    return "personID".equals(elementName)
        || "firstName".equals(elementName)
        || "lastName".equals(elementName)
        || "address".equals(elementName);
  }

  /** Returns the fake value for a replaced element; the name must satisfy isReplacedElement. */
  private static String fakeValueFor(String elementName, Person fakePerson)
  {
    if ("personID".equals(elementName))
    {
      return fakePerson.personId;
    }
    if ("firstName".equals(elementName))
    {
      return fakePerson.firstName;
    }
    if ("lastName".equals(elementName))
    {
      return fakePerson.lastName;
    }
    return fakePerson.address;
  }

  /** Tells whether the event is a start tag whose name (QName.toString()) equals name. */
  private static boolean isStartOf(XMLEvent event, String name)
  {
    return event.isStartElement() && event.asStartElement().getName().toString().equals(name);
  }

  /** Tells whether the event is an end tag whose name (QName.toString()) equals name. */
  private static boolean isEndOf(XMLEvent event, String name)
  {
    return event.isEndElement() && event.asEndElement().getName().toString().equals(name);
  }

  /**
   * Creates a simple fake person: the given ID is kept and the other fields get a random
   * suffix. This demo stub does not cache, so two calls with the same ID return different
   * names and addresses.
   */
  private static Person getFakePerson(String personId)
  {
    Person fakePerson = new Person();
    fakePerson.personId = personId;
    fakePerson.firstName = "fake first name: " + Math.random();
    fakePerson.lastName = "fake last name: " + Math.random();
    fakePerson.address = "fake address: " + Math.random();

    return fakePerson;
  }

  /** Plain holder for the fake values written into a Person element. */
  static class Person
  {
    String personId;
    String firstName;
    String lastName;
    String address;

  }
}

Use persons.xml as input:

<ADocument>
    <Stuff>
        <StuffA></StuffA>
    </Stuff>
    <OtherStuff>
        <OtherStuff>
            <ABC>yada yada</ABC>
        </OtherStuff>
    </OtherStuff>

    <Person>
        <uuid>11111111-1111-1111-1111-111111111111</uuid>
        <firstName>Some</firstName>
        <lastName>Person</lastName>
        <personID>111111111111</personID>
    </Person>
    <Person>
        <uuid>22222222-2222-2222-2222-222222222222</uuid>
        <firstName>Another Person</firstName>
        <address>Main St. 2</address>
        <personID>222222222222</personID>
    </Person>
    <Person>
        <uuid>33333333-3333-3333-3333-333333333333</uuid>
        <firstName>Some</firstName>
        <lastName>Person</lastName>
        <personID>111111111111</personID>
    </Person>

    <MoreStuff>
        <foo></foo>
        <foo>fooo</foo>
        <foo><bar></bar></foo>
        <foo>
            <bar></bar>
            <bar/>
            <bar>bb</bar>
        </foo>
        <bar/>
    </MoreStuff>

</ADocument>

Producing this fakePersons.xml result:

<?xml version="1.0" encoding="UTF-8"?><ADocument>
    <Stuff>
        <StuffA></StuffA>
    </Stuff>
    <OtherStuff>
        <OtherStuff>
            <ABC>yada yada</ABC>
        </OtherStuff>
    </OtherStuff>

    <Person>
        <uuid>11111111-1111-1111-1111-111111111111</uuid>
        <firstName>fake first name: 0.9518514637129984</firstName>
        <lastName>fake last name: 0.3495378044884426</lastName>
        <personID>111111111111</personID>
    </Person>
    <Person>
        <uuid>22222222-2222-2222-2222-222222222222</uuid>
        <firstName>fake first name: 0.8945739434355868</firstName>
        <address>fake address: 0.40784763231471777</address>
        <personID>222222222222</personID>
    </Person>
    <Person>
        <uuid>33333333-3333-3333-3333-333333333333</uuid>
        <firstName>fake first name: 0.7863207851479257</firstName>
        <lastName>fake last name: 0.09918620445731652</lastName>
        <personID>111111111111</personID>
    </Person>

    <MoreStuff>
        <foo></foo>
        <foo>fooo</foo>
        <foo><bar></bar></foo>
        <foo>
            <bar></bar>
            <bar></bar>
            <bar>bb</bar>
        </foo>
        <bar></bar>
    </MoreStuff>

</ADocument>
Share:
15,557
aznan
Author by

aznan

Updated on June 04, 2022

Comments

  • aznan
    aznan almost 2 years

    I have a bunch of XML documents which contain personal information that I need to replace with fake data. The Person node contains the following elements:

    • uuid - required, should not be touched.
    • firstName - optional
    • lastName - optional
    • address - optional
    • personID - required

    A person may appear many times, in which case the same fake data should be used, i.e. if two Person nodes have the same personID, they should both receive the same fake data.

    I have implemented some Java code that builds a DOM tree from the XML string and replaces the nodes before writing it back to a string. This works fine, but since I have so many documents I was wondering if there is a faster approach. Maybe through regular expressions or XSLT or something?

    Here is an example document:

    <ADocument>
      <Stuff>
        ...
      </Stuff>
      <OtherStuff>
        ...
      </OtherStuff>
      <Person>
        <uuid>11111111-1111-1111-1111-111111111111</uuid>
        <firstName>Some</firstName>
        <lastName>Person</lastName>
        <personID>111111111111</personID>
      </Person>
      <Person>
        <uuid>22222222-2222-2222-2222-222222222222</uuid>
        <firstName>Another Person</firstName>
        <address>Main St. 2</address>
        <personID>222222222222</personID>
      </Person>
      <Person>
        <uuid>33333333-3333-3333-3333-333333333333</uuid>
        <firstName>Some</firstName>
        <lastName>Person</lastName>
        <personID>111111111111</personID>
      </Person>
      <MoreStuff>
        ...
      </MoreStuff>
    </ADocument>
    

    And this is my current implementation:

    /**
     * Replaces the personal data of every {@code Person} node in the document with fake data.
     *
     * <p>The XML string is turned into a DOM via {@code toDOM}, all nodes matching the XPath
     * {@code //Person} are visited, and the text of their {@code firstName}, {@code lastName},
     * {@code address} and {@code personID} children is overwritten where such a child exists.
     *
     * @param xmlInstance the XML document as a string
     * @return the modified document, serialized by {@code toString(Document)}
     */
    public String replaceWithFalseData(String xmlInstance) {
        final Document document = toDOM(xmlInstance);
        final List<Node> persons = XPathExpressionFactory
                .createXPathExpression("//Person")
                .evaluateAsNodeList(document);

        for (int i = 0; i < persons.size(); i++) {
            final Map<String, Node> fields = getChildNodes(persons.get(i));

            // The real ID selects the fake person; per the author, getFakePerson returns a
            // cached fake person for that ID or creates a new one if none exists.
            final String realId = fields.get("personID").getTextContent();
            final Person replacement = getFakePerson(realId);

            setIfExists(fields.get("firstName"), replacement.getFirstName());
            setIfExists(fields.get("lastName"), replacement.getLastName());
            setIfExists(fields.get("address"), replacement.getAddress());
            setIfExists(fields.get("personID"), replacement.getPersonID());
        }

        return toString(document);
    }
    
    /**
     * Indexes the direct children of a node by their local name.
     *
     * <p>Children whose {@code getLocalName()} is {@code null} are left out. If several
     * children share a local name, the last one in document order is the one kept.
     *
     * @param parent the node whose children are indexed
     * @return a new map from local name to child node
     */
    public Map<String, Node> getChildNodes(Node parent) {
        final Map<String, Node> byName = new HashMap<String, Node>();

        Node current = parent.getFirstChild();
        while (current != null) {
            final String localName = current.getLocalName();
            if (localName != null) {
                byName.put(localName, current);
            }
            current = current.getNextSibling();
        }

        return byName;
    }
    
    /**
     * Sets the text content of a node, doing nothing when the node is {@code null}.
     *
     * @param node  the node to update, or {@code null} if the element is absent
     * @param value the new text content
     */
    public void setIfExists(Node node, String value) {
        if (node == null) {
            return;
        }
        node.setTextContent(value);
    }