001 /*
002 * Copyright 2007 - 2007 JEuclid, http://jeuclid.sf.net
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 /* $Id: Parser.java,v 2bab6eb875e8 2010/08/11 16:45:50 max $ */
018
019 package net.sourceforge.jeuclid.parser;
020
021 import java.io.BufferedInputStream;
022 import java.io.FilterInputStream;
023 import java.io.IOException;
024 import java.io.InputStream;
025 import java.io.Reader;
026 import java.lang.ref.Reference;
027 import java.lang.ref.SoftReference;
028 import java.util.Map;
029 import java.util.concurrent.ConcurrentHashMap;
030 import java.util.zip.ZipEntry;
031 import java.util.zip.ZipInputStream;
032
033 import javax.annotation.concurrent.ThreadSafe;
034 import javax.xml.parsers.DocumentBuilder;
035 import javax.xml.parsers.DocumentBuilderFactory;
036 import javax.xml.parsers.ParserConfigurationException;
037 import javax.xml.transform.Source;
038 import javax.xml.transform.Transformer;
039 import javax.xml.transform.TransformerException;
040 import javax.xml.transform.TransformerFactory;
041 import javax.xml.transform.dom.DOMResult;
042 import javax.xml.transform.dom.DOMSource;
043 import javax.xml.transform.stream.StreamSource;
044
045 import net.sourceforge.jeuclid.ResourceEntityResolver;
046
047 import org.apache.commons.logging.Log;
048 import org.apache.commons.logging.LogFactory;
049 import org.apache.xmlgraphics.image.loader.ImageSource;
050 import org.w3c.dom.Document;
051 import org.w3c.dom.Node;
052 import org.xml.sax.ErrorHandler;
053 import org.xml.sax.InputSource;
054 import org.xml.sax.SAXException;
055 import org.xml.sax.SAXParseException;
056
057 /**
058 * A JAXP compatible approach to MathML Parsing.
059 *
060 * @version $Revision: 2bab6eb875e8 $
061 */
062 // CHECKSTYLE:OFF
063 // This class is too complex.
064 @ThreadSafe
065 public final class Parser {
066 // CHECKSTYLE:ON
067
068 private static final class LoggerErrorHandler implements ErrorHandler {
069 public LoggerErrorHandler() {
070 // Empty on purpose
071 }
072
073 public void error(final SAXParseException exception)
074 throws SAXException {
075 Parser.LOGGER.warn(exception);
076 }
077
078 public void fatalError(final SAXParseException exception)
079 throws SAXException {
080 throw exception;
081 }
082
083 public void warning(final SAXParseException exception)
084 throws SAXException {
085 Parser.LOGGER.debug(exception);
086 }
087 }
088
089 private static final class UnclosableInputStream extends FilterInputStream {
090 protected UnclosableInputStream(final InputStream in) {
091 super(in);
092 }
093
094 @Override
095 public void close() throws IOException {
096 // Do Nothing.
097 }
098 }
099
100 /**
101 * Detection buffer size. Rationale: After the first 128 bytes a XML file
102 * and a ZIP file should be distinguishable.
103 */
104 private static final int DETECTION_BUFFER_SIZE = 128;
105
106 private static final String BAD_STREAM_SOURCE = "Bad StreamSource: ";
107
108 private static final String CONTENT_XML = "content.xml";
109
110 private static final String CANNOT_HANDLE_SOURCE = "Cannot handle Source: ";
111
112 private static final class SingletonHolder {
113 private static final Parser INSTANCE = new Parser();
114
115 private SingletonHolder() {
116 }
117 }
118
119 /**
120 * Logger for this class.
121 */
122 private static final Log LOGGER = LogFactory.getLog(Parser.class);
123
124 private final Map<Long, Reference<DocumentBuilder>> builders;
125
126 /**
127 * Default constructor.
128 */
129 protected Parser() {
130 this.builders = new ConcurrentHashMap<Long, Reference<DocumentBuilder>>();
131 }
132
133 private DocumentBuilder createDocumentBuilder() {
134 DocumentBuilder documentBuilder;
135 try {
136 try {
137 documentBuilder = this.tryCreateDocumentBuilder(true);
138 } catch (final UnsupportedOperationException uoe) {
139 Parser.LOGGER.debug("Unsupported Operation: "
140 + uoe.getMessage());
141 documentBuilder = this.tryCreateDocumentBuilder(false);
142 } catch (final ParserConfigurationException pce) {
143 Parser.LOGGER.debug("ParserConfigurationException: "
144 + pce.getMessage());
145 documentBuilder = this.tryCreateDocumentBuilder(false);
146 }
147 documentBuilder.setEntityResolver(new ResourceEntityResolver());
148 documentBuilder.setErrorHandler(new LoggerErrorHandler());
149 } catch (final ParserConfigurationException pce2) {
150 Parser.LOGGER.warn("Could not create Parser: " + pce2.getMessage());
151 assert false : "Could not create Parser";
152 documentBuilder = null;
153 }
154 return documentBuilder;
155 }
156
157 private DocumentBuilder tryCreateDocumentBuilder(final boolean xinclude)
158 throws ParserConfigurationException {
159 final DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory
160 .newInstance();
161 documentBuilderFactory.setNamespaceAware(true);
162 if (xinclude) {
163 documentBuilderFactory.setXIncludeAware(true);
164 }
165 final DocumentBuilder documentBuilder = documentBuilderFactory
166 .newDocumentBuilder();
167 return documentBuilder;
168 }
169
170 /**
171 * Retrieve the singleton Parser instance.
172 *
173 * @return a Parser object.
174 */
175 public static Parser getInstance() {
176 return Parser.SingletonHolder.INSTANCE;
177 }
178
179 /**
180 * use {@link #getInstance()} instead.
181 *
182 * @return see {@link #getInstance()}
183 * @throws ParserConfigurationException
184 * see {@link #getInstance()}
185 * @deprecated use {@link #getInstance()} instead.
186 */
187 @Deprecated
188 public static Parser getParser() throws ParserConfigurationException {
189 return Parser.getInstance();
190 }
191
192 /**
193 * Parse a StreamSource and return its Document.
194 * <p>
195 * This method will auto-detect ODF or XML format and load an appropriate
196 * parser.
197 *
198 * @param streamSource
199 * A StreamSource.
200 * @return A DOM Document representation for this source.
201 * @throws SAXException
202 * if a parse error occurred.
203 * @throws IOException
204 * if an I/O error occurred.
205 */
206 public Document parseStreamSource(final StreamSource streamSource)
207 throws SAXException, IOException {
208 Document retVal = null;
209 InputStream inputStream = streamSource.getInputStream();
210 if (inputStream != null) {
211
212 // Alternative 1: Parse as XML, and fall back to ODF
213 if (!inputStream.markSupported()) {
214 inputStream = new BufferedInputStream(inputStream);
215 }
216 final InputStream filterInput = new UnclosableInputStream(
217 inputStream);
218 filterInput.mark(Parser.DETECTION_BUFFER_SIZE);
219 try {
220 retVal = this.parseStreamSourceAsXml(new StreamSource(
221 filterInput));
222 inputStream.close();
223 } catch (final SAXParseException se) {
224 filterInput.reset();
225 try {
226 retVal = this.parseStreamSourceAsOdf(new StreamSource(
227 filterInput));
228 } catch (final IOException io) {
229 throw se;
230 }
231 inputStream.close();
232 }
233
234 // Alternative 2: peek for ZIP magic and call matching parser.
235
236 // final PushbackInputStream pi = new PushbackInputStream(
237 // inputStream, 4);
238 // final byte[] magic = new byte[4];
239 // pi.read(magic);
240 // pi.unread(magic);
241 // if ((magic[0] == 'P') && (magic[1] == 'K') && (magic[2] == 3)
242 // && (magic[3] == 4)) {
243 // retVal = this.parseStreamSourceAsOdf(streamSource);
244 // }
245 }
246 if (retVal == null) {
247 retVal = this.parseStreamSourceAsXml(streamSource);
248 }
249 return retVal;
250 }
251
252 /**
253 * Parse a given StreamSource which represents an ODF document.
254 *
255 * @param streamSource
256 * StreamSource to parse.
257 * @return the Document contained within.
258 * @throws SAXException
259 * if a parse error occurred.
260 * @throws IOException
261 * if an I/O error occurred.
262 */
263 public Document parseStreamSourceAsOdf(final StreamSource streamSource)
264 throws IOException, SAXException {
265 final InputStream is = streamSource.getInputStream();
266 if (is == null) {
267 throw new IllegalArgumentException(Parser.BAD_STREAM_SOURCE
268 + streamSource);
269 }
270 final ZipInputStream zipStream = new ZipInputStream(is);
271 Document document = null;
272 ZipEntry entry = zipStream.getNextEntry();
273 while (entry != null) {
274 if (Parser.CONTENT_XML.equals(entry.getName())) {
275 document = this.getDocumentBuilder().parse(zipStream);
276 entry = null;
277 } else {
278 entry = zipStream.getNextEntry();
279 }
280 }
281 return document;
282 }
283
284 /**
285 * Parse a given StreamSource which represents an XML document.
286 *
287 * @param streamSource
288 * StreamSource to parse.
289 * @return the Document contained within.
290 * @throws SAXException
291 * if a parse error occurred.
292 * @throws IOException
293 * if an I/O error occurred.
294 */
295 public Document parseStreamSourceAsXml(final StreamSource streamSource)
296 throws SAXException, IOException {
297 InputSource inp = null;
298 final String systemId = streamSource.getSystemId();
299 if (systemId != null) {
300 inp = new InputSource(systemId);
301 }
302 final InputStream is = streamSource.getInputStream();
303 if ((inp == null) && (is != null)) {
304 inp = new InputSource(is);
305 }
306 final Reader ir = streamSource.getReader();
307 if ((inp == null) && (ir != null)) {
308 inp = new InputSource(ir);
309 }
310
311 if (inp == null) {
312 throw new IllegalArgumentException(Parser.BAD_STREAM_SOURCE
313 + streamSource);
314 }
315
316 return this.getDocumentBuilder().parse(inp);
317 }
318
319 /**
320 * Retrieve a DocumentBuilder suitable for MathML parsing.
321 * <p>
322 * Please note:
323 * <ul>
324 * <li>There is one instance of the builder per thread.</li>
325 * <li>The builder instance is not thread safe, so it may not be passed
326 * among threads.</li>
327 * <li>Multiple Threads may call getDocumentBuilder concurrently</li>
328 * </ul>
329 *
330 * @return a DocumentBuilder
331 */
332 public DocumentBuilder getDocumentBuilder() {
333 // Note: No synchronization needed, as id will be different for every
334 // thread!
335 final long id = Thread.currentThread().getId();
336 final Reference<DocumentBuilder> builderRef = this.builders.get(id);
337 if (builderRef != null) {
338 final DocumentBuilder builder = builderRef.get();
339 if (builder != null) {
340 return builder;
341 }
342 }
343 final DocumentBuilder builder = this.createDocumentBuilder();
344 this.builders.put(id, new SoftReference<DocumentBuilder>(builder));
345 return builder;
346 }
347
348 /**
349 * Extract the top Node from a given Source.
350 *
351 * @param source
352 * the Source to use. Currently supported are {@link DOMSource} ,
353 * {@link StreamSource}
354 * @return the top NODE.
355 * @throws SAXException
356 * if a parse error occurred.
357 * @throws IOException
358 * if an I/O error occurred.
359 */
360 public Node parse(final Source source) throws SAXException, IOException {
361 final Node retVal;
362 if (source instanceof StreamSource) {
363 final StreamSource streamSource = (StreamSource) source;
364 retVal = this.parseStreamSource(streamSource);
365 } else if (source instanceof ImageSource) {
366 final ImageSource imageSource = (ImageSource) source;
367 final StreamSource streamSource = new StreamSource(imageSource
368 .getInputStream());
369 retVal = this.parseStreamSource(streamSource);
370 } else if (source instanceof DOMSource) {
371 final DOMSource domSource = (DOMSource) source;
372 retVal = domSource.getNode();
373 } else {
374 try {
375 final Transformer t = TransformerFactory.newInstance()
376 .newTransformer();
377 final DOMResult r = new DOMResult();
378 t.transform(source, r);
379 retVal = r.getNode();
380 } catch (final TransformerException e) {
381 Parser.LOGGER.warn(e.getMessage());
382 throw new IllegalArgumentException(Parser.CANNOT_HANDLE_SOURCE
383 + source, e);
384 }
385 }
386 return retVal;
387 }
388 }