001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.io;
020    
021    import java.io.IOException;
022    import java.io.DataInput;
023    import java.io.DataOutput;
024    import java.nio.ByteBuffer;
025    import java.nio.CharBuffer;
026    import java.nio.charset.CharacterCodingException;
027    import java.nio.charset.Charset;
028    import java.nio.charset.CharsetDecoder;
029    import java.nio.charset.CharsetEncoder;
030    import java.nio.charset.CodingErrorAction;
031    import java.nio.charset.MalformedInputException;
032    import java.text.CharacterIterator;
033    import java.text.StringCharacterIterator;
034    import java.util.Arrays;
035    
036    import org.apache.avro.reflect.Stringable;
037    
038    import org.apache.hadoop.classification.InterfaceAudience;
039    import org.apache.hadoop.classification.InterfaceStability;
040    
041    /** This class stores text using standard UTF8 encoding.  It provides methods
042     * to serialize, deserialize, and compare texts at byte level.  The type of
043     * length is integer and is serialized using zero-compressed format.  <p>In
044     * addition, it provides methods for string traversal without converting the
045     * byte array to a string.  <p>Also includes utilities for
046     * serializing/deserialing a string, coding/decoding a string, checking if a
047     * byte array contains valid UTF8 code, calculating the length of an encoded
048     * string.
049     */
050    @Stringable
051    @InterfaceAudience.Public
052    @InterfaceStability.Stable
053    public class Text extends BinaryComparable
054        implements WritableComparable<BinaryComparable> {
055      
056      static final int SHORT_STRING_MAX = 1024 * 1024;
057      
058      private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
059        new ThreadLocal<CharsetEncoder>() {
060          protected CharsetEncoder initialValue() {
061            return Charset.forName("UTF-8").newEncoder().
062                   onMalformedInput(CodingErrorAction.REPORT).
063                   onUnmappableCharacter(CodingErrorAction.REPORT);
064        }
065      };
066      
067      private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
068        new ThreadLocal<CharsetDecoder>() {
069        protected CharsetDecoder initialValue() {
070          return Charset.forName("UTF-8").newDecoder().
071                 onMalformedInput(CodingErrorAction.REPORT).
072                 onUnmappableCharacter(CodingErrorAction.REPORT);
073        }
074      };
075      
076      private static final byte [] EMPTY_BYTES = new byte[0];
077      
078      private byte[] bytes;
079      private int length;
080    
081      public Text() {
082        bytes = EMPTY_BYTES;
083      }
084    
085      /** Construct from a string. 
086       */
087      public Text(String string) {
088        set(string);
089      }
090    
091      /** Construct from another text. */
092      public Text(Text utf8) {
093        set(utf8);
094      }
095    
096      /** Construct from a byte array.
097       */
098      public Text(byte[] utf8)  {
099        set(utf8);
100      }
101      
102      /**
103       * Get a copy of the bytes that is exactly the length of the data.
104       * See {@link #getBytes()} for faster access to the underlying array.
105       */
106      public byte[] copyBytes() {
107        byte[] result = new byte[length];
108        System.arraycopy(bytes, 0, result, 0, length);
109        return result;
110      }
111      
112      /**
113       * Returns the raw bytes; however, only data up to {@link #getLength()} is
114       * valid. Please use {@link #copyBytes()} if you
115       * need the returned array to be precisely the length of the data.
116       */
117      public byte[] getBytes() {
118        return bytes;
119      }
120    
121      /** Returns the number of bytes in the byte array */ 
122      public int getLength() {
123        return length;
124      }
125      
126      /**
127       * Returns the Unicode Scalar Value (32-bit integer value)
128       * for the character at <code>position</code>. Note that this
129       * method avoids using the converter or doing String instatiation
130       * @return the Unicode scalar value at position or -1
131       *          if the position is invalid or points to a
132       *          trailing byte
133       */
134      public int charAt(int position) {
135        if (position > this.length) return -1; // too long
136        if (position < 0) return -1; // duh.
137          
138        ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
139        return bytesToCodePoint(bb.slice());
140      }
141      
142      public int find(String what) {
143        return find(what, 0);
144      }
145      
146      /**
147       * Finds any occurence of <code>what</code> in the backing
148       * buffer, starting as position <code>start</code>. The starting
149       * position is measured in bytes and the return value is in
150       * terms of byte position in the buffer. The backing buffer is
151       * not converted to a string for this operation.
152       * @return byte position of the first occurence of the search
153       *         string in the UTF-8 buffer or -1 if not found
154       */
155      public int find(String what, int start) {
156        try {
157          ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
158          ByteBuffer tgt = encode(what);
159          byte b = tgt.get();
160          src.position(start);
161              
162          while (src.hasRemaining()) {
163            if (b == src.get()) { // matching first byte
164              src.mark(); // save position in loop
165              tgt.mark(); // save position in target
166              boolean found = true;
167              int pos = src.position()-1;
168              while (tgt.hasRemaining()) {
169                if (!src.hasRemaining()) { // src expired first
170                  tgt.reset();
171                  src.reset();
172                  found = false;
173                  break;
174                }
175                if (!(tgt.get() == src.get())) {
176                  tgt.reset();
177                  src.reset();
178                  found = false;
179                  break; // no match
180                }
181              }
182              if (found) return pos;
183            }
184          }
185          return -1; // not found
186        } catch (CharacterCodingException e) {
187          // can't get here
188          e.printStackTrace();
189          return -1;
190        }
191      }  
192      /** Set to contain the contents of a string. 
193       */
194      public void set(String string) {
195        try {
196          ByteBuffer bb = encode(string, true);
197          bytes = bb.array();
198          length = bb.limit();
199        }catch(CharacterCodingException e) {
200          throw new RuntimeException("Should not have happened ", e); 
201        }
202      }
203    
204      /** Set to a utf8 byte array
205       */
206      public void set(byte[] utf8) {
207        set(utf8, 0, utf8.length);
208      }
209      
210      /** copy a text. */
211      public void set(Text other) {
212        set(other.getBytes(), 0, other.getLength());
213      }
214    
215      /**
216       * Set the Text to range of bytes
217       * @param utf8 the data to copy from
218       * @param start the first position of the new string
219       * @param len the number of bytes of the new string
220       */
221      public void set(byte[] utf8, int start, int len) {
222        setCapacity(len, false);
223        System.arraycopy(utf8, start, bytes, 0, len);
224        this.length = len;
225      }
226    
227      /**
228       * Append a range of bytes to the end of the given text
229       * @param utf8 the data to copy from
230       * @param start the first position to append from utf8
231       * @param len the number of bytes to append
232       */
233      public void append(byte[] utf8, int start, int len) {
234        setCapacity(length + len, true);
235        System.arraycopy(utf8, start, bytes, length, len);
236        length += len;
237      }
238    
239      /**
240       * Clear the string to empty.
241       */
242      public void clear() {
243        length = 0;
244      }
245    
246      /*
247       * Sets the capacity of this Text object to <em>at least</em>
248       * <code>len</code> bytes. If the current buffer is longer,
249       * then the capacity and existing content of the buffer are
250       * unchanged. If <code>len</code> is larger
251       * than the current capacity, the Text object's capacity is
252       * increased to match.
253       * @param len the number of bytes we need
254       * @param keepData should the old data be kept
255       */
256      private void setCapacity(int len, boolean keepData) {
257        if (bytes == null || bytes.length < len) {
258          if (bytes != null && keepData) {
259            bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
260          } else {
261            bytes = new byte[len];
262          }
263        }
264      }
265       
266      /** 
267       * Convert text back to string
268       * @see java.lang.Object#toString()
269       */
270      @Override
271      public String toString() {
272        try {
273          return decode(bytes, 0, length);
274        } catch (CharacterCodingException e) { 
275          throw new RuntimeException("Should not have happened " , e); 
276        }
277      }
278      
279      /** deserialize 
280       */
281      public void readFields(DataInput in) throws IOException {
282        int newLength = WritableUtils.readVInt(in);
283        setCapacity(newLength, false);
284        in.readFully(bytes, 0, newLength);
285        length = newLength;
286      }
287      
288      public void readFields(DataInput in, int maxLength) throws IOException {
289        int newLength = WritableUtils.readVInt(in);
290        if (newLength < 0) {
291          throw new IOException("tried to deserialize " + newLength +
292              " bytes of data!  newLength must be non-negative.");
293        } else if (newLength >= maxLength) {
294          throw new IOException("tried to deserialize " + newLength +
295              " bytes of data, but maxLength = " + maxLength);
296        }
297        setCapacity(newLength, false);
298        in.readFully(bytes, 0, newLength);
299        length = newLength;
300      }
301    
302      /** Skips over one Text in the input. */
303      public static void skip(DataInput in) throws IOException {
304        int length = WritableUtils.readVInt(in);
305        WritableUtils.skipFully(in, length);
306      }
307    
308      /** serialize
309       * write this object to out
310       * length uses zero-compressed encoding
311       * @see Writable#write(DataOutput)
312       */
313      public void write(DataOutput out) throws IOException {
314        WritableUtils.writeVInt(out, length);
315        out.write(bytes, 0, length);
316      }
317    
318      public void write(DataOutput out, int maxLength) throws IOException {
319        if (length > maxLength) {
320          throw new IOException("data was too long to write!  Expected " +
321              "less than or equal to " + maxLength + " bytes, but got " +
322              length + " bytes.");
323        }
324        WritableUtils.writeVInt(out, length);
325        out.write(bytes, 0, length);
326      }
327    
328      /** Returns true iff <code>o</code> is a Text with the same contents.  */
329      public boolean equals(Object o) {
330        if (o instanceof Text)
331          return super.equals(o);
332        return false;
333      }
334    
335      @Override
336      public int hashCode() {
337        return super.hashCode();
338      }
339    
340      /** A WritableComparator optimized for Text keys. */
341      public static class Comparator extends WritableComparator {
342        public Comparator() {
343          super(Text.class);
344        }
345    
346        public int compare(byte[] b1, int s1, int l1,
347                           byte[] b2, int s2, int l2) {
348          int n1 = WritableUtils.decodeVIntSize(b1[s1]);
349          int n2 = WritableUtils.decodeVIntSize(b2[s2]);
350          return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
351        }
352      }
353    
354      static {
355        // register this comparator
356        WritableComparator.define(Text.class, new Comparator());
357      }
358    
359      /// STATIC UTILITIES FROM HERE DOWN
360      /**
361       * Converts the provided byte array to a String using the
362       * UTF-8 encoding. If the input is malformed,
363       * replace by a default value.
364       */
365      public static String decode(byte[] utf8) throws CharacterCodingException {
366        return decode(ByteBuffer.wrap(utf8), true);
367      }
368      
369      public static String decode(byte[] utf8, int start, int length) 
370        throws CharacterCodingException {
371        return decode(ByteBuffer.wrap(utf8, start, length), true);
372      }
373      
374      /**
375       * Converts the provided byte array to a String using the
376       * UTF-8 encoding. If <code>replace</code> is true, then
377       * malformed input is replaced with the
378       * substitution character, which is U+FFFD. Otherwise the
379       * method throws a MalformedInputException.
380       */
381      public static String decode(byte[] utf8, int start, int length, boolean replace) 
382        throws CharacterCodingException {
383        return decode(ByteBuffer.wrap(utf8, start, length), replace);
384      }
385      
386      private static String decode(ByteBuffer utf8, boolean replace) 
387        throws CharacterCodingException {
388        CharsetDecoder decoder = DECODER_FACTORY.get();
389        if (replace) {
390          decoder.onMalformedInput(
391              java.nio.charset.CodingErrorAction.REPLACE);
392          decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
393        }
394        String str = decoder.decode(utf8).toString();
395        // set decoder back to its default value: REPORT
396        if (replace) {
397          decoder.onMalformedInput(CodingErrorAction.REPORT);
398          decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
399        }
400        return str;
401      }
402    
403      /**
404       * Converts the provided String to bytes using the
405       * UTF-8 encoding. If the input is malformed,
406       * invalid chars are replaced by a default value.
407       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
408       *                     and length is ByteBuffer.limit()
409       */
410    
411      public static ByteBuffer encode(String string)
412        throws CharacterCodingException {
413        return encode(string, true);
414      }
415    
416      /**
417       * Converts the provided String to bytes using the
418       * UTF-8 encoding. If <code>replace</code> is true, then
419       * malformed input is replaced with the
420       * substitution character, which is U+FFFD. Otherwise the
421       * method throws a MalformedInputException.
422       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
423       *                     and length is ByteBuffer.limit()
424       */
425      public static ByteBuffer encode(String string, boolean replace)
426        throws CharacterCodingException {
427        CharsetEncoder encoder = ENCODER_FACTORY.get();
428        if (replace) {
429          encoder.onMalformedInput(CodingErrorAction.REPLACE);
430          encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
431        }
432        ByteBuffer bytes = 
433          encoder.encode(CharBuffer.wrap(string.toCharArray()));
434        if (replace) {
435          encoder.onMalformedInput(CodingErrorAction.REPORT);
436          encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
437        }
438        return bytes;
439      }
440    
441      static final public int DEFAULT_MAX_LEN = 1024 * 1024;
442    
443      /** Read a UTF8 encoded string from in
444       */
445      public static String readString(DataInput in) throws IOException {
446        int length = WritableUtils.readVInt(in);
447        byte [] bytes = new byte[length];
448        in.readFully(bytes, 0, length);
449        return decode(bytes);
450      }
451      
452      /** Read a UTF8 encoded string with a maximum size
453       */
454      public static String readString(DataInput in, int maxLength)
455          throws IOException {
456        int length = WritableUtils.readVIntInRange(in, 0, maxLength);
457        byte [] bytes = new byte[length];
458        in.readFully(bytes, 0, length);
459        return decode(bytes);
460      }
461      
462      /** Write a UTF8 encoded string to out
463       */
464      public static int writeString(DataOutput out, String s) throws IOException {
465        ByteBuffer bytes = encode(s);
466        int length = bytes.limit();
467        WritableUtils.writeVInt(out, length);
468        out.write(bytes.array(), 0, length);
469        return length;
470      }
471    
472      /** Write a UTF8 encoded string with a maximum size to out
473       */
474      public static int writeString(DataOutput out, String s, int maxLength)
475          throws IOException {
476        ByteBuffer bytes = encode(s);
477        int length = bytes.limit();
478        if (length > maxLength) {
479          throw new IOException("string was too long to write!  Expected " +
480              "less than or equal to " + maxLength + " bytes, but got " +
481              length + " bytes.");
482        }
483        WritableUtils.writeVInt(out, length);
484        out.write(bytes.array(), 0, length);
485        return length;
486      }
487    
488      ////// states for validateUTF8
489      
490      private static final int LEAD_BYTE = 0;
491    
492      private static final int TRAIL_BYTE_1 = 1;
493    
494      private static final int TRAIL_BYTE = 2;
495    
496      /** 
497       * Check if a byte array contains valid utf-8
498       * @param utf8 byte array
499       * @throws MalformedInputException if the byte array contains invalid utf-8
500       */
501      public static void validateUTF8(byte[] utf8) throws MalformedInputException {
502        validateUTF8(utf8, 0, utf8.length);     
503      }
504      
505      /**
506       * Check to see if a byte array is valid utf-8
507       * @param utf8 the array of bytes
508       * @param start the offset of the first byte in the array
509       * @param len the length of the byte sequence
510       * @throws MalformedInputException if the byte array contains invalid bytes
511       */
512      public static void validateUTF8(byte[] utf8, int start, int len)
513        throws MalformedInputException {
514        int count = start;
515        int leadByte = 0;
516        int length = 0;
517        int state = LEAD_BYTE;
518        while (count < start+len) {
519          int aByte = ((int) utf8[count] & 0xFF);
520    
521          switch (state) {
522          case LEAD_BYTE:
523            leadByte = aByte;
524            length = bytesFromUTF8[aByte];
525    
526            switch (length) {
527            case 0: // check for ASCII
528              if (leadByte > 0x7F)
529                throw new MalformedInputException(count);
530              break;
531            case 1:
532              if (leadByte < 0xC2 || leadByte > 0xDF)
533                throw new MalformedInputException(count);
534              state = TRAIL_BYTE_1;
535              break;
536            case 2:
537              if (leadByte < 0xE0 || leadByte > 0xEF)
538                throw new MalformedInputException(count);
539              state = TRAIL_BYTE_1;
540              break;
541            case 3:
542              if (leadByte < 0xF0 || leadByte > 0xF4)
543                throw new MalformedInputException(count);
544              state = TRAIL_BYTE_1;
545              break;
546            default:
547              // too long! Longest valid UTF-8 is 4 bytes (lead + three)
548              // or if < 0 we got a trail byte in the lead byte position
549              throw new MalformedInputException(count);
550            } // switch (length)
551            break;
552    
553          case TRAIL_BYTE_1:
554            if (leadByte == 0xF0 && aByte < 0x90)
555              throw new MalformedInputException(count);
556            if (leadByte == 0xF4 && aByte > 0x8F)
557              throw new MalformedInputException(count);
558            if (leadByte == 0xE0 && aByte < 0xA0)
559              throw new MalformedInputException(count);
560            if (leadByte == 0xED && aByte > 0x9F)
561              throw new MalformedInputException(count);
562            // falls through to regular trail-byte test!!
563          case TRAIL_BYTE:
564            if (aByte < 0x80 || aByte > 0xBF)
565              throw new MalformedInputException(count);
566            if (--length == 0) {
567              state = LEAD_BYTE;
568            } else {
569              state = TRAIL_BYTE;
570            }
571            break;
572          } // switch (state)
573          count++;
574        }
575      }
576    
577      /**
578       * Magic numbers for UTF-8. These are the number of bytes
579       * that <em>follow</em> a given lead byte. Trailing bytes
580       * have the value -1. The values 4 and 5 are presented in
581       * this table, even though valid UTF-8 cannot include the
582       * five and six byte sequences.
583       */
584      static final int[] bytesFromUTF8 =
585      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
586        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
587        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
588        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
589        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
590        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
591        0, 0, 0, 0, 0, 0, 0,
592        // trail bytes
593        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
594        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
595        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
596        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
597        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
598        1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
599        3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
600    
601      /**
602       * Returns the next code point at the current position in
603       * the buffer. The buffer's position will be incremented.
604       * Any mark set on this buffer will be changed by this method!
605       */
606      public static int bytesToCodePoint(ByteBuffer bytes) {
607        bytes.mark();
608        byte b = bytes.get();
609        bytes.reset();
610        int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
611        if (extraBytesToRead < 0) return -1; // trailing byte!
612        int ch = 0;
613    
614        switch (extraBytesToRead) {
615        case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
616        case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
617        case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
618        case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
619        case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
620        case 0: ch += (bytes.get() & 0xFF);
621        }
622        ch -= offsetsFromUTF8[extraBytesToRead];
623    
624        return ch;
625      }
626    
627      
628      static final int offsetsFromUTF8[] =
629      { 0x00000000, 0x00003080,
630        0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
631    
632      /**
633       * For the given string, returns the number of UTF-8 bytes
634       * required to encode the string.
635       * @param string text to encode
636       * @return number of UTF-8 bytes required to encode
637       */
638      public static int utf8Length(String string) {
639        CharacterIterator iter = new StringCharacterIterator(string);
640        char ch = iter.first();
641        int size = 0;
642        while (ch != CharacterIterator.DONE) {
643          if ((ch >= 0xD800) && (ch < 0xDC00)) {
644            // surrogate pair?
645            char trail = iter.next();
646            if ((trail > 0xDBFF) && (trail < 0xE000)) {
647              // valid pair
648              size += 4;
649            } else {
650              // invalid pair
651              size += 3;
652              iter.previous(); // rewind one
653            }
654          } else if (ch < 0x80) {
655            size++;
656          } else if (ch < 0x800) {
657            size += 2;
658          } else {
659            // ch < 0x10000, that is, the largest char value
660            size += 3;
661          }
662          ch = iter.next();
663        }
664        return size;
665      }
666    }