001 /* 002 * Copyright 2007 - 2007 JEuclid, http://jeuclid.sf.net 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 /* $Id: Parser.java,v 2bab6eb875e8 2010/08/11 16:45:50 max $ */ 018 019 package net.sourceforge.jeuclid.parser; 020 021 import java.io.BufferedInputStream; 022 import java.io.FilterInputStream; 023 import java.io.IOException; 024 import java.io.InputStream; 025 import java.io.Reader; 026 import java.lang.ref.Reference; 027 import java.lang.ref.SoftReference; 028 import java.util.Map; 029 import java.util.concurrent.ConcurrentHashMap; 030 import java.util.zip.ZipEntry; 031 import java.util.zip.ZipInputStream; 032 033 import javax.annotation.concurrent.ThreadSafe; 034 import javax.xml.parsers.DocumentBuilder; 035 import javax.xml.parsers.DocumentBuilderFactory; 036 import javax.xml.parsers.ParserConfigurationException; 037 import javax.xml.transform.Source; 038 import javax.xml.transform.Transformer; 039 import javax.xml.transform.TransformerException; 040 import javax.xml.transform.TransformerFactory; 041 import javax.xml.transform.dom.DOMResult; 042 import javax.xml.transform.dom.DOMSource; 043 import javax.xml.transform.stream.StreamSource; 044 045 import net.sourceforge.jeuclid.ResourceEntityResolver; 046 047 import org.apache.commons.logging.Log; 048 import org.apache.commons.logging.LogFactory; 049 import org.apache.xmlgraphics.image.loader.ImageSource; 050 import org.w3c.dom.Document; 051 import org.w3c.dom.Node; 052 import org.xml.sax.ErrorHandler; 053 import org.xml.sax.InputSource; 054 import org.xml.sax.SAXException; 055 import org.xml.sax.SAXParseException; 056 057 /** 058 * A JAXP compatible approach to MathML Parsing. 059 * 060 * @version $Revision: 2bab6eb875e8 $ 061 */ 062 // CHECKSTYLE:OFF 063 // This class is too complex. 064 @ThreadSafe 065 public final class Parser { 066 // CHECKSTYLE:ON 067 068 private static final class LoggerErrorHandler implements ErrorHandler { 069 public LoggerErrorHandler() { 070 // Empty on purpose 071 } 072 073 public void error(final SAXParseException exception) 074 throws SAXException { 075 Parser.LOGGER.warn(exception); 076 } 077 078 public void fatalError(final SAXParseException exception) 079 throws SAXException { 080 throw exception; 081 } 082 083 public void warning(final SAXParseException exception) 084 throws SAXException { 085 Parser.LOGGER.debug(exception); 086 } 087 } 088 089 private static final class UnclosableInputStream extends FilterInputStream { 090 protected UnclosableInputStream(final InputStream in) { 091 super(in); 092 } 093 094 @Override 095 public void close() throws IOException { 096 // Do Nothing. 097 } 098 } 099 100 /** 101 * Detection buffer size. Rationale: After the first 128 bytes a XML file 102 * and a ZIP file should be distinguishable. 103 */ 104 private static final int DETECTION_BUFFER_SIZE = 128; 105 106 private static final String BAD_STREAM_SOURCE = "Bad StreamSource: "; 107 108 private static final String CONTENT_XML = "content.xml"; 109 110 private static final String CANNOT_HANDLE_SOURCE = "Cannot handle Source: "; 111 112 private static final class SingletonHolder { 113 private static final Parser INSTANCE = new Parser(); 114 115 private SingletonHolder() { 116 } 117 } 118 119 /** 120 * Logger for this class. 121 */ 122 private static final Log LOGGER = LogFactory.getLog(Parser.class); 123 124 private final Map<Long, Reference<DocumentBuilder>> builders; 125 126 /** 127 * Default constructor. 128 */ 129 protected Parser() { 130 this.builders = new ConcurrentHashMap<Long, Reference<DocumentBuilder>>(); 131 } 132 133 private DocumentBuilder createDocumentBuilder() { 134 DocumentBuilder documentBuilder; 135 try { 136 try { 137 documentBuilder = this.tryCreateDocumentBuilder(true); 138 } catch (final UnsupportedOperationException uoe) { 139 Parser.LOGGER.debug("Unsupported Operation: " 140 + uoe.getMessage()); 141 documentBuilder = this.tryCreateDocumentBuilder(false); 142 } catch (final ParserConfigurationException pce) { 143 Parser.LOGGER.debug("ParserConfigurationException: " 144 + pce.getMessage()); 145 documentBuilder = this.tryCreateDocumentBuilder(false); 146 } 147 documentBuilder.setEntityResolver(new ResourceEntityResolver()); 148 documentBuilder.setErrorHandler(new LoggerErrorHandler()); 149 } catch (final ParserConfigurationException pce2) { 150 Parser.LOGGER.warn("Could not create Parser: " + pce2.getMessage()); 151 assert false : "Could not create Parser"; 152 documentBuilder = null; 153 } 154 return documentBuilder; 155 } 156 157 private DocumentBuilder tryCreateDocumentBuilder(final boolean xinclude) 158 throws ParserConfigurationException { 159 final DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory 160 .newInstance(); 161 documentBuilderFactory.setNamespaceAware(true); 162 if (xinclude) { 163 documentBuilderFactory.setXIncludeAware(true); 164 } 165 final DocumentBuilder documentBuilder = documentBuilderFactory 166 .newDocumentBuilder(); 167 return documentBuilder; 168 } 169 170 /** 171 * Retrieve the singleton Parser instance. 172 * 173 * @return a Parser object. 174 */ 175 public static Parser getInstance() { 176 return Parser.SingletonHolder.INSTANCE; 177 } 178 179 /** 180 * use {@link #getInstance()} instead. 181 * 182 * @return see {@link #getInstance()} 183 * @throws ParserConfigurationException 184 * see {@link #getInstance()} 185 * @deprecated use {@link #getInstance()} instead. 186 */ 187 @Deprecated 188 public static Parser getParser() throws ParserConfigurationException { 189 return Parser.getInstance(); 190 } 191 192 /** 193 * Parse a StreamSource and return its Document. 194 * <p> 195 * This method will auto-detect ODF or XML format and load an appropriate 196 * parser. 197 * 198 * @param streamSource 199 * A StreamSource. 200 * @return A DOM Document representation for this source. 201 * @throws SAXException 202 * if a parse error occurred. 203 * @throws IOException 204 * if an I/O error occurred. 205 */ 206 public Document parseStreamSource(final StreamSource streamSource) 207 throws SAXException, IOException { 208 Document retVal = null; 209 InputStream inputStream = streamSource.getInputStream(); 210 if (inputStream != null) { 211 212 // Alternative 1: Parse as XML, and fall back to ODF 213 if (!inputStream.markSupported()) { 214 inputStream = new BufferedInputStream(inputStream); 215 } 216 final InputStream filterInput = new UnclosableInputStream( 217 inputStream); 218 filterInput.mark(Parser.DETECTION_BUFFER_SIZE); 219 try { 220 retVal = this.parseStreamSourceAsXml(new StreamSource( 221 filterInput)); 222 inputStream.close(); 223 } catch (final SAXParseException se) { 224 filterInput.reset(); 225 try { 226 retVal = this.parseStreamSourceAsOdf(new StreamSource( 227 filterInput)); 228 } catch (final IOException io) { 229 throw se; 230 } 231 inputStream.close(); 232 } 233 234 // Alternative 2: peek for ZIP magic and call matching parser. 235 236 // final PushbackInputStream pi = new PushbackInputStream( 237 // inputStream, 4); 238 // final byte[] magic = new byte[4]; 239 // pi.read(magic); 240 // pi.unread(magic); 241 // if ((magic[0] == 'P') && (magic[1] == 'K') && (magic[2] == 3) 242 // && (magic[3] == 4)) { 243 // retVal = this.parseStreamSourceAsOdf(streamSource); 244 // } 245 } 246 if (retVal == null) { 247 retVal = this.parseStreamSourceAsXml(streamSource); 248 } 249 return retVal; 250 } 251 252 /** 253 * Parse a given StreamSource which represents an ODF document. 254 * 255 * @param streamSource 256 * StreamSource to parse. 257 * @return the Document contained within. 258 * @throws SAXException 259 * if a parse error occurred. 260 * @throws IOException 261 * if an I/O error occurred. 262 */ 263 public Document parseStreamSourceAsOdf(final StreamSource streamSource) 264 throws IOException, SAXException { 265 final InputStream is = streamSource.getInputStream(); 266 if (is == null) { 267 throw new IllegalArgumentException(Parser.BAD_STREAM_SOURCE 268 + streamSource); 269 } 270 final ZipInputStream zipStream = new ZipInputStream(is); 271 Document document = null; 272 ZipEntry entry = zipStream.getNextEntry(); 273 while (entry != null) { 274 if (Parser.CONTENT_XML.equals(entry.getName())) { 275 document = this.getDocumentBuilder().parse(zipStream); 276 entry = null; 277 } else { 278 entry = zipStream.getNextEntry(); 279 } 280 } 281 return document; 282 } 283 284 /** 285 * Parse a given StreamSource which represents an XML document. 286 * 287 * @param streamSource 288 * StreamSource to parse. 289 * @return the Document contained within. 290 * @throws SAXException 291 * if a parse error occurred. 292 * @throws IOException 293 * if an I/O error occurred. 294 */ 295 public Document parseStreamSourceAsXml(final StreamSource streamSource) 296 throws SAXException, IOException { 297 InputSource inp = null; 298 final String systemId = streamSource.getSystemId(); 299 if (systemId != null) { 300 inp = new InputSource(systemId); 301 } 302 final InputStream is = streamSource.getInputStream(); 303 if ((inp == null) && (is != null)) { 304 inp = new InputSource(is); 305 } 306 final Reader ir = streamSource.getReader(); 307 if ((inp == null) && (ir != null)) { 308 inp = new InputSource(ir); 309 } 310 311 if (inp == null) { 312 throw new IllegalArgumentException(Parser.BAD_STREAM_SOURCE 313 + streamSource); 314 } 315 316 return this.getDocumentBuilder().parse(inp); 317 } 318 319 /** 320 * Retrieve a DocumentBuilder suitable for MathML parsing. 321 * <p> 322 * Please note: 323 * <ul> 324 * <li>There is one instance of the builder per thread.</li> 325 * <li>The builder instance is not thread safe, so it may not be passed 326 * among threads.</li> 327 * <li>Multiple Threads may call getDocumentBuilder concurrently</li> 328 * </ul> 329 * 330 * @return a DocumentBuilder 331 */ 332 public DocumentBuilder getDocumentBuilder() { 333 // Note: No synchronization needed, as id will be different for every 334 // thread! 335 final long id = Thread.currentThread().getId(); 336 final Reference<DocumentBuilder> builderRef = this.builders.get(id); 337 if (builderRef != null) { 338 final DocumentBuilder builder = builderRef.get(); 339 if (builder != null) { 340 return builder; 341 } 342 } 343 final DocumentBuilder builder = this.createDocumentBuilder(); 344 this.builders.put(id, new SoftReference<DocumentBuilder>(builder)); 345 return builder; 346 } 347 348 /** 349 * Extract the top Node from a given Source. 350 * 351 * @param source 352 * the Source to use. Currently supported are {@link DOMSource} , 353 * {@link StreamSource} 354 * @return the top NODE. 355 * @throws SAXException 356 * if a parse error occurred. 357 * @throws IOException 358 * if an I/O error occurred. 359 */ 360 public Node parse(final Source source) throws SAXException, IOException { 361 final Node retVal; 362 if (source instanceof StreamSource) { 363 final StreamSource streamSource = (StreamSource) source; 364 retVal = this.parseStreamSource(streamSource); 365 } else if (source instanceof ImageSource) { 366 final ImageSource imageSource = (ImageSource) source; 367 final StreamSource streamSource = new StreamSource(imageSource 368 .getInputStream()); 369 retVal = this.parseStreamSource(streamSource); 370 } else if (source instanceof DOMSource) { 371 final DOMSource domSource = (DOMSource) source; 372 retVal = domSource.getNode(); 373 } else { 374 try { 375 final Transformer t = TransformerFactory.newInstance() 376 .newTransformer(); 377 final DOMResult r = new DOMResult(); 378 t.transform(source, r); 379 retVal = r.getNode(); 380 } catch (final TransformerException e) { 381 Parser.LOGGER.warn(e.getMessage()); 382 throw new IllegalArgumentException(Parser.CANNOT_HANDLE_SOURCE 383 + source, e); 384 } 385 } 386 return retVal; 387 } 388 }