| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| /* |
| * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $ |
| */ |
| package org.apache.xml.serializer; |
| |
| |
| /** |
| * Holds information about a given encoding, which is the Java name for the |
| * encoding, the equivalent ISO name. |
| * <p> |
| * An object of this type has two useful methods |
| * <pre> |
| * isInEncoding(char ch); |
| * </pre> |
| * which can be called if the character is not the high one in |
| * a surrogate pair and: |
| * <pre> |
| * isInEncoding(char high, char low); |
| * </pre> |
| * which can be called if the two characters from a high/low surrogate pair. |
| * <p> |
| * An EncodingInfo object is a node in a binary search tree. Such a node |
| * will answer if a character is in the encoding, and do so for a given |
| * range of unicode values (<code>m_first</code> to |
| * <code>m_last</code>). It will handle a certain range of values |
| * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>). |
| * If the unicode point is before that explicit range, that is it |
| * is in the range <code>m_first <= value < m_explFirst</code>, then it will delegate to another EncodingInfo object for The root |
| * of such a tree, m_before. Likewise for values in the range |
| * <code>m_explLast < value <= m_last</code>, but delgating to <code>m_after</code> |
| * <p> |
| * Actually figuring out if a code point is in the encoding is expensive. So the |
| * purpose of this tree is to cache such determinations, and not to build the |
| * entire tree of information at the start, but only build up as much of the |
| * tree as is used during the transformation. |
| * <p> |
| * This Class is not a public API, and should only be used internally within |
| * the serializer. |
| * <p> |
| * This class is not a public API. |
| * @xsl.usage internal |
| */ |
| public final class EncodingInfo extends Object |
| { |
| |
| /** |
| * Not all characters in an encoding are in on contiguous group, |
| * however there is a lowest contiguous group starting at '\u0001' |
| * and working up to m_highCharInContiguousGroup. |
| * <p> |
| * This is the char for which chars at or below this value are |
| * definately in the encoding, although for chars |
| * above this point they might be in the encoding. |
| * This exists for performance, especially for ASCII characters |
| * because for ASCII all chars in the range '\u0001' to '\u007F' |
| * are in the encoding. |
| * |
| */ |
| private final char m_highCharInContiguousGroup; |
| |
| /** |
| * The ISO encoding name. |
| */ |
| final String name; |
| |
| /** |
| * The name used by the Java convertor. |
| */ |
| final String javaName; |
| |
| /** |
| * A helper object that we can ask if a |
| * single char, or a surrogate UTF-16 pair |
| * of chars that form a single character, |
| * is in this encoding. |
| */ |
| private InEncoding m_encoding; |
| |
| /** |
| * This is not a public API. It returns true if the |
| * char in question is in the encoding. |
| * @param ch the char in question. |
| * <p> |
| * This method is not a public API. |
| * @xsl.usage internal |
| */ |
| public boolean isInEncoding(char ch) { |
| if (m_encoding == null) { |
| m_encoding = new EncodingImpl(); |
| |
| // One could put alternate logic in here to |
| // instantiate another object that implements the |
| // InEncoding interface. For example if the JRE is 1.4 or up |
| // we could have an object that uses JRE 1.4 methods |
| } |
| return m_encoding.isInEncoding(ch); |
| } |
| |
| /** |
| * This is not a public API. It returns true if the |
| * character formed by the high/low pair is in the encoding. |
| * @param high a char that the a high char of a high/low surrogate pair. |
| * @param low a char that is the low char of a high/low surrogate pair. |
| * <p> |
| * This method is not a public API. |
| * @xsl.usage internal |
| */ |
| public boolean isInEncoding(char high, char low) { |
| if (m_encoding == null) { |
| m_encoding = new EncodingImpl(); |
| |
| // One could put alternate logic in here to |
| // instantiate another object that implements the |
| // InEncoding interface. For example if the JRE is 1.4 or up |
| // we could have an object that uses JRE 1.4 methods |
| } |
| return m_encoding.isInEncoding(high, low); |
| } |
| |
| /** |
| * Create an EncodingInfo object based on the ISO name and Java name. |
| * If both parameters are null any character will be considered to |
| * be in the encoding. This is useful for when the serializer is in |
| * temporary output state, and has no assciated encoding. |
| * |
| * @param name reference to the ISO name. |
| * @param javaName reference to the Java encoding name. |
| * @param highChar The char for which characters at or below this value are |
| * definately in the |
| * encoding, although for characters above this point they might be in the encoding. |
| */ |
| public EncodingInfo(String name, String javaName, char highChar) |
| { |
| |
| this.name = name; |
| this.javaName = javaName; |
| this.m_highCharInContiguousGroup = highChar; |
| } |
| |
| |
| |
| /** |
| * A simple interface to isolate the implementation. |
| * We could also use some new JRE 1.4 methods in another implementation |
| * provided we use reflection with them. |
| * <p> |
| * This interface is not a public API, |
| * and should only be used internally within the serializer. |
| * @xsl.usage internal |
| */ |
| private interface InEncoding { |
| /** |
| * Returns true if the char is in the encoding |
| */ |
| public boolean isInEncoding(char ch); |
| /** |
| * Returns true if the high/low surrogate pair forms |
| * a character that is in the encoding. |
| */ |
| public boolean isInEncoding(char high, char low); |
| } |
| |
| /** |
| * This class implements the |
| */ |
| private class EncodingImpl implements InEncoding { |
| |
| |
| |
| public boolean isInEncoding(char ch1) { |
| final boolean ret; |
| int codePoint = Encodings.toCodePoint(ch1); |
| if (codePoint < m_explFirst) { |
| // The unicode value is before the range |
| // that we explictly manage, so we delegate the answer. |
| |
| // If we don't have an m_before object to delegate to, make one. |
| if (m_before == null) |
| m_before = |
| new EncodingImpl( |
| m_encoding, |
| m_first, |
| m_explFirst - 1, |
| codePoint); |
| ret = m_before.isInEncoding(ch1); |
| } else if (m_explLast < codePoint) { |
| // The unicode value is after the range |
| // that we explictly manage, so we delegate the answer. |
| |
| // If we don't have an m_after object to delegate to, make one. |
| if (m_after == null) |
| m_after = |
| new EncodingImpl( |
| m_encoding, |
| m_explLast + 1, |
| m_last, |
| codePoint); |
| ret = m_after.isInEncoding(ch1); |
| } else { |
| // The unicode value is in the range we explitly handle |
| final int idx = codePoint - m_explFirst; |
| |
| // If we already know the answer, just return it. |
| if (m_alreadyKnown[idx]) |
| ret = m_isInEncoding[idx]; |
| else { |
| // We don't know the answer, so find out, |
| // which may be expensive, then cache the answer |
| ret = inEncoding(ch1, m_encoding); |
| m_alreadyKnown[idx] = true; |
| m_isInEncoding[idx] = ret; |
| } |
| } |
| return ret; |
| } |
| |
| public boolean isInEncoding(char high, char low) { |
| final boolean ret; |
| int codePoint = Encodings.toCodePoint(high,low); |
| if (codePoint < m_explFirst) { |
| // The unicode value is before the range |
| // that we explictly manage, so we delegate the answer. |
| |
| // If we don't have an m_before object to delegate to, make one. |
| if (m_before == null) |
| m_before = |
| new EncodingImpl( |
| m_encoding, |
| m_first, |
| m_explFirst - 1, |
| codePoint); |
| ret = m_before.isInEncoding(high,low); |
| } else if (m_explLast < codePoint) { |
| // The unicode value is after the range |
| // that we explictly manage, so we delegate the answer. |
| |
| // If we don't have an m_after object to delegate to, make one. |
| if (m_after == null) |
| m_after = |
| new EncodingImpl( |
| m_encoding, |
| m_explLast + 1, |
| m_last, |
| codePoint); |
| ret = m_after.isInEncoding(high,low); |
| } else { |
| // The unicode value is in the range we explitly handle |
| final int idx = codePoint - m_explFirst; |
| |
| // If we already know the answer, just return it. |
| if (m_alreadyKnown[idx]) |
| ret = m_isInEncoding[idx]; |
| else { |
| // We don't know the answer, so find out, |
| // which may be expensive, then cache the answer |
| ret = inEncoding(high, low, m_encoding); |
| m_alreadyKnown[idx] = true; |
| m_isInEncoding[idx] = ret; |
| } |
| } |
| return ret; |
| } |
| |
| /** |
| * The encoding. |
| */ |
| final private String m_encoding; |
| /** |
| * m_first through m_last is the range of unicode |
| * values that this object will return an answer on. |
| * It may delegate to a similar object with a different |
| * range |
| */ |
| final private int m_first; |
| |
| /** |
| * m_explFirst through m_explLast is the range of unicode |
| * value that this object handles explicitly and does not |
| * delegate to a similar object. |
| */ |
| final private int m_explFirst; |
| final private int m_explLast; |
| final private int m_last; |
| |
| /** |
| * The object, of the same type as this one, |
| * that handles unicode values in a range before |
| * the range explictly handled by this object, and |
| * to which this object may delegate. |
| */ |
| private InEncoding m_before; |
| /** |
| * The object, of the same type as this one, |
| * that handles unicode values in a range after |
| * the range explictly handled by this object, and |
| * to which this object may delegate. |
| */ |
| private InEncoding m_after; |
| |
| /** |
| * The number of unicode values explicitly handled |
| * by a single EncodingInfo object. This value is |
| * tuneable, but is set to 128 because that covers the |
| * entire low range of ASCII type chars within a single |
| * object. |
| */ |
| private static final int RANGE = 128; |
| |
| /** |
| * A flag to record if we already know the answer |
| * for the given unicode value. |
| */ |
| final private boolean m_alreadyKnown[] = new boolean[RANGE]; |
| /** |
| * A table holding the answer on whether the given unicode |
| * value is in the encoding. |
| */ |
| final private boolean m_isInEncoding[] = new boolean[RANGE]; |
| |
| private EncodingImpl() { |
| // This object will answer whether any unicode value |
| // is in the encoding, it handles values 0 through Integer.MAX_VALUE |
| this(javaName, 0, Integer.MAX_VALUE, (char) 0); |
| } |
| |
| private EncodingImpl(String encoding, int first, int last, int codePoint) { |
| // Set the range of unicode values that this object manages |
| // either explicitly or implicitly. |
| m_first = first; |
| m_last = last; |
| |
| // Set the range of unicode values that this object |
| // explicitly manages |
| m_explFirst = codePoint; |
| m_explLast = codePoint + (RANGE-1); |
| |
| m_encoding = encoding; |
| |
| if (javaName != null) |
| { |
| // Some optimization. |
| if (0 <= m_explFirst && m_explFirst <= 127) { |
| // This particular EncodingImpl explicitly handles |
| // characters in the low range. |
| if ("UTF8".equals(javaName) |
| || "UTF-16".equals(javaName) |
| || "ASCII".equals(javaName) |
| || "US-ASCII".equals(javaName) |
| || "Unicode".equals(javaName) |
| || "UNICODE".equals(javaName) |
| || javaName.startsWith("ISO8859")) { |
| |
| // Not only does this EncodingImpl object explicitly |
| // handle chracters in the low range, it is |
| // also one that we know something about, without |
| // needing to call inEncoding(char ch, String encoding) |
| // for this low range |
| // |
| // By initializing the table ahead of time |
| // for these low values, we prevent the expensive |
| // inEncoding(char ch, String encoding) |
| // from being called, at least for these common |
| // encodings. |
| for (int unicode = 1; unicode < 127; unicode++) { |
| final int idx = unicode - m_explFirst; |
| if (0 <= idx && idx < RANGE) { |
| m_alreadyKnown[idx] = true; |
| m_isInEncoding[idx] = true; |
| } |
| } |
| } |
| } |
| |
| /* A little bit more than optimization. |
| * |
| * We will say that any character is in the encoding if |
| * we don't have an encoding. |
| * This is meaningful when the serializer is being used |
| * in temporary output state, where we are not writing to |
| * the final output tree. It is when writing to the |
| * final output tree that we need to worry about the output |
| * encoding |
| */ |
| if (javaName == null) { |
| for (int idx = 0; idx < m_alreadyKnown.length; idx++) { |
| m_alreadyKnown[idx] = true; |
| m_isInEncoding[idx] = true; |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * This is heart of the code that determines if a given character |
| * is in the given encoding. This method is probably expensive, |
| * and the answer should be cached. |
| * <p> |
| * This method is not a public API, |
| * and should only be used internally within the serializer. |
| * @param ch the char in question, that is not a high char of |
| * a high/low surrogate pair. |
| * @param encoding the Java name of the enocding. |
| * |
| * @xsl.usage internal |
| * |
| */ |
| private static boolean inEncoding(char ch, String encoding) { |
| boolean isInEncoding; |
| try { |
| char cArray[] = new char[1]; |
| cArray[0] = ch; |
| // Construct a String from the char |
| String s = new String(cArray); |
| // Encode the String into a sequence of bytes |
| // using the given, named charset. |
| byte[] bArray = s.getBytes(encoding); |
| isInEncoding = inEncoding(ch, bArray); |
| |
| } catch (Exception e) { |
| isInEncoding = false; |
| |
| // If for some reason the encoding is null, e.g. |
| // for a temporary result tree, we should just |
| // say that every character is in the encoding. |
| if (encoding == null) |
| isInEncoding = true; |
| } |
| return isInEncoding; |
| } |
| |
| /** |
| * This is heart of the code that determines if a given high/low |
| * surrogate pair forms a character that is in the given encoding. |
| * This method is probably expensive, and the answer should be cached. |
| * <p> |
| * This method is not a public API, |
| * and should only be used internally within the serializer. |
| * @param high the high char of |
| * a high/low surrogate pair. |
| * @param low the low char of a high/low surrogate pair. |
| * @param encoding the Java name of the encoding. |
| * |
| * @xsl.usage internal |
| * |
| */ |
| private static boolean inEncoding(char high, char low, String encoding) { |
| boolean isInEncoding; |
| try { |
| char cArray[] = new char[2]; |
| cArray[0] = high; |
| cArray[1] = low; |
| // Construct a String from the char |
| String s = new String(cArray); |
| // Encode the String into a sequence of bytes |
| // using the given, named charset. |
| byte[] bArray = s.getBytes(encoding); |
| isInEncoding = inEncoding(high,bArray); |
| } catch (Exception e) { |
| isInEncoding = false; |
| } |
| |
| return isInEncoding; |
| } |
| |
| /** |
| * This method is the core of determining if character |
| * is in the encoding. The method is not foolproof, because |
| * s.getBytes(encoding) has specified behavior only if the |
| * characters are in the specified encoding. However this |
| * method tries it's best. |
| * @param ch the char that was converted using getBytes, or |
| * the first char of a high/low pair that was converted. |
| * @param data the bytes written out by the call to s.getBytes(encoding); |
| * @return true if the character is in the encoding. |
| */ |
| private static boolean inEncoding(char ch, byte[] data) { |
| final boolean isInEncoding; |
| // If the string written out as data is not in the encoding, |
| // the output is not specified according to the documentation |
| // on the String.getBytes(encoding) method, |
| // but we do our best here. |
| if (data==null || data.length == 0) { |
| isInEncoding = false; |
| } |
| else { |
| if (data[0] == 0) |
| isInEncoding = false; |
| else if (data[0] == '?' && ch != '?') |
| isInEncoding = false; |
| /* |
| * else if (isJapanese) { |
| * // isJapanese is really |
| * // ( "EUC-JP".equals(javaName) |
| * // || "EUC_JP".equals(javaName) |
| * // || "SJIS".equals(javaName) ) |
| * |
| * // Work around some bugs in JRE for Japanese |
| * if(data[0] == 0x21) |
| * isInEncoding = false; |
| * else if (ch == 0xA5) |
| * isInEncoding = false; |
| * else |
| * isInEncoding = true; |
| * } |
| */ |
| |
| else { |
| // We don't know for sure, but it looks like it is in the encoding |
| isInEncoding = true; |
| } |
| } |
| return isInEncoding; |
| } |
| |
| /** |
| * This method exists for performance reasons. |
| * <p> |
| * Except for '\u0000', if a char is less than or equal to the value |
| * returned by this method then it in the encoding. |
| * <p> |
| * The characters in an encoding are not contiguous, however |
| * there is a lowest group of chars starting at '\u0001' upto and |
| * including the char returned by this method that are all in the encoding. |
| * So the char returned by this method essentially defines the lowest |
| * contiguous group. |
| * <p> |
| * chars above the value returned might be in the encoding, but |
| * chars at or below the value returned are definately in the encoding. |
| * <p> |
| * In any case however, the isInEncoding(char) method can be used |
| * regardless of the value of the char returned by this method. |
| * <p> |
| * If the value returned is '\u0000' it means that every character must be tested |
| * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)} |
| * for surrogate pairs. |
| * <p> |
| * This method is not a public API. |
| * @xsl.usage internal |
| */ |
| public final char getHighChar() { |
| return m_highCharInContiguousGroup; |
| } |
| |
| } |