Getting SAXParseException "not well-formed (invalid token)" and unable to resolve it

I need to parse a very large XML file in Scrapy. It looks something like this:

<Result>
    <Node>
        <browseNodeId>306533011</browseNodeId>
        <browseNodeAttributes count="1">
            <attribute name="item_type_keyword">temperature-controllers</attribute>
        </browseNodeAttributes>
        <browseNodeName>Temperature Controllers</browseNodeName>
        <browseNodeStoreContextName>Temperature Controllers</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,306533011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Temperature Controllers</browsePathByName>
        <hasChildren>false</hasChildren>
        <childNodes count="0"/>
        <productTypeDefinitions>TEMPERATURE_CONTROLLER</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
    <Node>
        <browseNodeId>9931457011</browseNodeId>
        <browseNodeAttributes count="1">
            <attribute name="item_type_keyword">industrial-and-scientific-temperature-indicators</attribute>
        </browseNodeAttributes>
        <browseNodeName>Temperature Indicators</browseNodeName>
        <browseNodeStoreContextName>Temperature Indicators</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,9931457011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Temperature Indicators</browsePathByName>
        <hasChildren>false</hasChildren>
        <childNodes count="0"/>
        <productTypeDefinitions>PRECISION_MEASURING</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
    <Node>
        <browseNodeId>5006547011</browseNodeId>
        <browseNodeAttributes count="1">
            <attribute name="item_type_keyword">industrial-temperature-sensors</attribute>
        </browseNodeAttributes>
        <browseNodeName>Temperature Probes & Sensors</browseNodeName>
        <browseNodeStoreContextName>Temperature Probes & Sensors</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,5006547011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Temperature Probes & Sensors</browsePathByName>
        <hasChildren>false</hasChildren>
        <childNodes count="0"/>
        <productTypeDefinitions>PRECISION_MEASURING</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
    <Node>
        <browseNodeId>9931455011</browseNodeId>
        <browseNodeAttributes count="1">
            <attribute name="item_type_keyword">thermal-imagers</attribute>
        </browseNodeAttributes>
        <browseNodeName>Thermal Imagers</browseNodeName>
        <browseNodeStoreContextName>Thermal Imagers</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,9931455011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Thermal Imagers</browsePathByName>
        <hasChildren>false</hasChildren>
        <childNodes count="0"/>
        <productTypeDefinitions>PRECISION_MEASURING</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
    <Node>
        <browseNodeId>393280011</browseNodeId>
        <browseNodeAttributes count="0"/>
        <browseNodeName>Thermometers</browseNodeName>
        <browseNodeStoreContextName>Thermometers</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,393280011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Thermometers</browsePathByName>
        <hasChildren>true</hasChildren>
        <childNodes count="4">
            <id>393282011</id>
            <id>393284011</id>
            <id>393283011</id>
            <id>9931459011</id>
        </childNodes>
        <productTypeDefinitions>PRECISION_MEASURING</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
    <Node>
        <browseNodeId>393282011</browseNodeId>
        <browseNodeAttributes count="1">
            <attribute name="item_type_keyword">industrial-and-scientific-dial-thermometers</attribute>
        </browseNodeAttributes>
        <browseNodeName>Dial Thermometers</browseNodeName>
        <browseNodeStoreContextName>Dial Thermometers</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,393280011,393282011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Thermometers,Dial Thermometers</browsePathByName>
        <hasChildren>false</hasChildren>
        <childNodes count="0"/>
        <productTypeDefinitions>PRECISION_MEASURING</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
    <Node>
        <browseNodeId>393284011</browseNodeId>
        <browseNodeAttributes count="1">
            <attribute name="item_type_keyword">science-lab-digital-thermometers</attribute>
        </browseNodeAttributes>
        <browseNodeName>Digital Thermometers</browseNodeName>
        <browseNodeStoreContextName>Lab Digital Thermometers</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,393280011,393284011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Thermometers,Digital Thermometers</browsePathByName>
        <hasChildren>false</hasChildren>
        <childNodes count="0"/>
        <productTypeDefinitions>LAB_SUPPLY</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
    <Node>
        <browseNodeId>393283011</browseNodeId>
        <browseNodeAttributes count="1">
            <attribute name="item_type_keyword">industrial-and-scientific-glass-thermometers</attribute>
        </browseNodeAttributes>
        <browseNodeName>Glass Thermometers</browseNodeName>
        <browseNodeStoreContextName>Glass Thermometers</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,393280011,393283011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Thermometers,Glass Thermometers</browsePathByName>
        <hasChildren>false</hasChildren>
        <childNodes count="0"/>
        <productTypeDefinitions>PRECISION_MEASURING</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
    <Node>
        <browseNodeId>9931459011</browseNodeId>
        <browseNodeAttributes count="1">
            <attribute name="item_type_keyword">infrared-thermometers</attribute>
        </browseNodeAttributes>
        <browseNodeName>Infrared Thermometers</browseNodeName>
        <browseNodeStoreContextName>Infrared Thermometers</browseNodeStoreContextName>
        <browsePathById>16310091,16310161,256409011,5006566011,393280011,9931459011</browsePathById>
        <browsePathByName>Industrial & Scientific,Test, Measure & Inspect,Temperature & Humidity,Thermometers,Infrared Thermometers</browsePathByName>
        <hasChildren>false</hasChildren>
        <childNodes count="0"/>
        <productTypeDefinitions>PRECISION_MEASURING</productTypeDefinitions>
        <refinementsInformation count="0"/>
    </Node>
</Result>

It’s giving me the error xml.sax._exceptions.SAXParseException: nodes.xml:11:38: not well-formed (invalid token). As the XML file is very large, I can’t opt for replacing each and every ampersand by hand.

At the moment I have not implemented this in Scrapy yet, but a simple class is included below for reference. How can this be troubleshot without replacing each and every ampersand?

import xml.sax


class ABContentHandler(xml.sax.ContentHandler):
    """SAX content handler that prints every parse event it receives.

    Construction is inherited unchanged from ``xml.sax.ContentHandler``;
    this class adds no state of its own.
    """

    def startElement(self, name, attrs):
        """Print the opening tag; for ``address`` elements also print ``type``.

        Raises ``KeyError`` (from ``attrs.getValue``) when an ``address``
        element carries no ``type`` attribute.
        """
        print("startElement '%s'" % (name,))
        if name != "address":
            return
        address_type = attrs.getValue("type")
        print("tattribute type='%s'" % (address_type,))

    def endElement(self, name):
        """Print the name of the element that was just closed."""
        print("endElement '%s'" % (name,))

    def characters(self, content):
        """Print one chunk of character data exactly as delivered."""
        print("characters '%s'" % (content,))

def main(sourceFileName):
    """Parse the XML file at *sourceFileName*, printing each SAX event.

    Events are reported through an ``ABContentHandler`` instance. Any
    ``xml.sax.SAXParseException`` raised for malformed input propagates
    to the caller.
    """
    # Open in binary mode so the parser receives the raw bytes instead of
    # text already decoded with the locale's default encoding; the ``with``
    # block guarantees the handle is closed even when parsing fails.
    with open(sourceFileName, "rb") as source:
        xml.sax.parse(source, ABContentHandler())

# Script entry point: parse the sample file only when this module is run
# directly, not when it is imported.
if __name__ == "__main__":
    main("nodes.xml")

Output

startElement 'Result'
characters '
'
characters '    '
startElement 'Node'
characters '
'
characters '        '
startElement 'browseNodeId'
characters '306533011'
endElement 'browseNodeId'
characters '
'
characters '        '
startElement 'browseNodeAttributes'
characters '
'
characters '            '
startElement 'attribute'
characters 'temperature-controllers'
endElement 'attribute'
characters '
'
characters '        '
endElement 'browseNodeAttributes'
characters '
'
characters '        '
startElement 'browseNodeName'
characters 'Temperature Controllers'
endElement 'browseNodeName'
characters '
'
characters '        '
startElement 'browseNodeStoreContextName'
characters 'Temperature Controllers'
endElement 'browseNodeStoreContextName'
characters '
'
characters '        '
Traceback (most recent call last):
  File "/home/gtac/sax/parser.py", line 26, in <module>
    main("nodes.xml")
  File "/home/gtac/sax/parser.py", line 23, in main
    xml.sax.parse(source, ABContentHandler())
  File "/usr/lib/python2.7/xml/sax/__init__.py", line 33, in parse
    parser.parse(source)
  File "/usr/lib/python2.7/xml/sax/expatreader.py", line 107, in parse
    xmlreader.IncrementalParser.parse(self, source)
  File "/usr/lib/python2.7/xml/sax/xmlreader.py", line 123, in parse
    self.feed(buffer)
  File "/usr/lib/python2.7/xml/sax/expatreader.py", line 214, in feed
    self._err_handler.fatalError(exc)
  File "/usr/lib/python2.7/xml/sax/handler.py", line 38, in fatalError
    raise exception
xml.sax._exceptions.SAXParseException: nodes.xml:11:38: not well-formed (invalid token)
startElement 'browsePathById'
characters '16310091,16310161,256409011,5006566011,306533011'
endElement 'browsePathById'
characters '
'
characters '        '
startElement 'browsePathByName'
characters 'Industrial '

Process finished with exit code 1


Source: xml

Leave a Reply