001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.io; 020 021 import java.io.IOException; 022 import java.io.DataInput; 023 import java.io.DataOutput; 024 import java.nio.ByteBuffer; 025 import java.nio.CharBuffer; 026 import java.nio.charset.CharacterCodingException; 027 import java.nio.charset.Charset; 028 import java.nio.charset.CharsetDecoder; 029 import java.nio.charset.CharsetEncoder; 030 import java.nio.charset.CodingErrorAction; 031 import java.nio.charset.MalformedInputException; 032 import java.text.CharacterIterator; 033 import java.text.StringCharacterIterator; 034 import java.util.Arrays; 035 036 import org.apache.avro.reflect.Stringable; 037 038 import org.apache.hadoop.classification.InterfaceAudience; 039 import org.apache.hadoop.classification.InterfaceStability; 040 041 /** This class stores text using standard UTF8 encoding. It provides methods 042 * to serialize, deserialize, and compare texts at byte level. The type of 043 * length is integer and is serialized using zero-compressed format. <p>In 044 * addition, it provides methods for string traversal without converting the 045 * byte array to a string. <p>Also includes utilities for 046 * serializing/deserialing a string, coding/decoding a string, checking if a 047 * byte array contains valid UTF8 code, calculating the length of an encoded 048 * string. 049 */ 050 @Stringable 051 @InterfaceAudience.Public 052 @InterfaceStability.Stable 053 public class Text extends BinaryComparable 054 implements WritableComparable<BinaryComparable> { 055 056 static final int SHORT_STRING_MAX = 1024 * 1024; 057 058 private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY = 059 new ThreadLocal<CharsetEncoder>() { 060 protected CharsetEncoder initialValue() { 061 return Charset.forName("UTF-8").newEncoder(). 062 onMalformedInput(CodingErrorAction.REPORT). 063 onUnmappableCharacter(CodingErrorAction.REPORT); 064 } 065 }; 066 067 private static ThreadLocal<CharsetDecoder> DECODER_FACTORY = 068 new ThreadLocal<CharsetDecoder>() { 069 protected CharsetDecoder initialValue() { 070 return Charset.forName("UTF-8").newDecoder(). 071 onMalformedInput(CodingErrorAction.REPORT). 072 onUnmappableCharacter(CodingErrorAction.REPORT); 073 } 074 }; 075 076 private static final byte [] EMPTY_BYTES = new byte[0]; 077 078 private byte[] bytes; 079 private int length; 080 081 public Text() { 082 bytes = EMPTY_BYTES; 083 } 084 085 /** Construct from a string. 086 */ 087 public Text(String string) { 088 set(string); 089 } 090 091 /** Construct from another text. */ 092 public Text(Text utf8) { 093 set(utf8); 094 } 095 096 /** Construct from a byte array. 097 */ 098 public Text(byte[] utf8) { 099 set(utf8); 100 } 101 102 /** 103 * Get a copy of the bytes that is exactly the length of the data. 104 * See {@link #getBytes()} for faster access to the underlying array. 105 */ 106 public byte[] copyBytes() { 107 byte[] result = new byte[length]; 108 System.arraycopy(bytes, 0, result, 0, length); 109 return result; 110 } 111 112 /** 113 * Returns the raw bytes; however, only data up to {@link #getLength()} is 114 * valid. Please use {@link #copyBytes()} if you 115 * need the returned array to be precisely the length of the data. 116 */ 117 public byte[] getBytes() { 118 return bytes; 119 } 120 121 /** Returns the number of bytes in the byte array */ 122 public int getLength() { 123 return length; 124 } 125 126 /** 127 * Returns the Unicode Scalar Value (32-bit integer value) 128 * for the character at <code>position</code>. Note that this 129 * method avoids using the converter or doing String instatiation 130 * @return the Unicode scalar value at position or -1 131 * if the position is invalid or points to a 132 * trailing byte 133 */ 134 public int charAt(int position) { 135 if (position > this.length) return -1; // too long 136 if (position < 0) return -1; // duh. 137 138 ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position); 139 return bytesToCodePoint(bb.slice()); 140 } 141 142 public int find(String what) { 143 return find(what, 0); 144 } 145 146 /** 147 * Finds any occurence of <code>what</code> in the backing 148 * buffer, starting as position <code>start</code>. The starting 149 * position is measured in bytes and the return value is in 150 * terms of byte position in the buffer. The backing buffer is 151 * not converted to a string for this operation. 152 * @return byte position of the first occurence of the search 153 * string in the UTF-8 buffer or -1 if not found 154 */ 155 public int find(String what, int start) { 156 try { 157 ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length); 158 ByteBuffer tgt = encode(what); 159 byte b = tgt.get(); 160 src.position(start); 161 162 while (src.hasRemaining()) { 163 if (b == src.get()) { // matching first byte 164 src.mark(); // save position in loop 165 tgt.mark(); // save position in target 166 boolean found = true; 167 int pos = src.position()-1; 168 while (tgt.hasRemaining()) { 169 if (!src.hasRemaining()) { // src expired first 170 tgt.reset(); 171 src.reset(); 172 found = false; 173 break; 174 } 175 if (!(tgt.get() == src.get())) { 176 tgt.reset(); 177 src.reset(); 178 found = false; 179 break; // no match 180 } 181 } 182 if (found) return pos; 183 } 184 } 185 return -1; // not found 186 } catch (CharacterCodingException e) { 187 // can't get here 188 e.printStackTrace(); 189 return -1; 190 } 191 } 192 /** Set to contain the contents of a string. 193 */ 194 public void set(String string) { 195 try { 196 ByteBuffer bb = encode(string, true); 197 bytes = bb.array(); 198 length = bb.limit(); 199 }catch(CharacterCodingException e) { 200 throw new RuntimeException("Should not have happened ", e); 201 } 202 } 203 204 /** Set to a utf8 byte array 205 */ 206 public void set(byte[] utf8) { 207 set(utf8, 0, utf8.length); 208 } 209 210 /** copy a text. */ 211 public void set(Text other) { 212 set(other.getBytes(), 0, other.getLength()); 213 } 214 215 /** 216 * Set the Text to range of bytes 217 * @param utf8 the data to copy from 218 * @param start the first position of the new string 219 * @param len the number of bytes of the new string 220 */ 221 public void set(byte[] utf8, int start, int len) { 222 setCapacity(len, false); 223 System.arraycopy(utf8, start, bytes, 0, len); 224 this.length = len; 225 } 226 227 /** 228 * Append a range of bytes to the end of the given text 229 * @param utf8 the data to copy from 230 * @param start the first position to append from utf8 231 * @param len the number of bytes to append 232 */ 233 public void append(byte[] utf8, int start, int len) { 234 setCapacity(length + len, true); 235 System.arraycopy(utf8, start, bytes, length, len); 236 length += len; 237 } 238 239 /** 240 * Clear the string to empty. 241 */ 242 public void clear() { 243 length = 0; 244 } 245 246 /* 247 * Sets the capacity of this Text object to <em>at least</em> 248 * <code>len</code> bytes. If the current buffer is longer, 249 * then the capacity and existing content of the buffer are 250 * unchanged. If <code>len</code> is larger 251 * than the current capacity, the Text object's capacity is 252 * increased to match. 253 * @param len the number of bytes we need 254 * @param keepData should the old data be kept 255 */ 256 private void setCapacity(int len, boolean keepData) { 257 if (bytes == null || bytes.length < len) { 258 if (bytes != null && keepData) { 259 bytes = Arrays.copyOf(bytes, Math.max(len,length << 1)); 260 } else { 261 bytes = new byte[len]; 262 } 263 } 264 } 265 266 /** 267 * Convert text back to string 268 * @see java.lang.Object#toString() 269 */ 270 @Override 271 public String toString() { 272 try { 273 return decode(bytes, 0, length); 274 } catch (CharacterCodingException e) { 275 throw new RuntimeException("Should not have happened " , e); 276 } 277 } 278 279 /** deserialize 280 */ 281 public void readFields(DataInput in) throws IOException { 282 int newLength = WritableUtils.readVInt(in); 283 setCapacity(newLength, false); 284 in.readFully(bytes, 0, newLength); 285 length = newLength; 286 } 287 288 public void readFields(DataInput in, int maxLength) throws IOException { 289 int newLength = WritableUtils.readVInt(in); 290 if (newLength < 0) { 291 throw new IOException("tried to deserialize " + newLength + 292 " bytes of data! newLength must be non-negative."); 293 } else if (newLength >= maxLength) { 294 throw new IOException("tried to deserialize " + newLength + 295 " bytes of data, but maxLength = " + maxLength); 296 } 297 setCapacity(newLength, false); 298 in.readFully(bytes, 0, newLength); 299 length = newLength; 300 } 301 302 /** Skips over one Text in the input. */ 303 public static void skip(DataInput in) throws IOException { 304 int length = WritableUtils.readVInt(in); 305 WritableUtils.skipFully(in, length); 306 } 307 308 /** serialize 309 * write this object to out 310 * length uses zero-compressed encoding 311 * @see Writable#write(DataOutput) 312 */ 313 public void write(DataOutput out) throws IOException { 314 WritableUtils.writeVInt(out, length); 315 out.write(bytes, 0, length); 316 } 317 318 public void write(DataOutput out, int maxLength) throws IOException { 319 if (length > maxLength) { 320 throw new IOException("data was too long to write! Expected " + 321 "less than or equal to " + maxLength + " bytes, but got " + 322 length + " bytes."); 323 } 324 WritableUtils.writeVInt(out, length); 325 out.write(bytes, 0, length); 326 } 327 328 /** Returns true iff <code>o</code> is a Text with the same contents. */ 329 public boolean equals(Object o) { 330 if (o instanceof Text) 331 return super.equals(o); 332 return false; 333 } 334 335 @Override 336 public int hashCode() { 337 return super.hashCode(); 338 } 339 340 /** A WritableComparator optimized for Text keys. */ 341 public static class Comparator extends WritableComparator { 342 public Comparator() { 343 super(Text.class); 344 } 345 346 public int compare(byte[] b1, int s1, int l1, 347 byte[] b2, int s2, int l2) { 348 int n1 = WritableUtils.decodeVIntSize(b1[s1]); 349 int n2 = WritableUtils.decodeVIntSize(b2[s2]); 350 return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2); 351 } 352 } 353 354 static { 355 // register this comparator 356 WritableComparator.define(Text.class, new Comparator()); 357 } 358 359 /// STATIC UTILITIES FROM HERE DOWN 360 /** 361 * Converts the provided byte array to a String using the 362 * UTF-8 encoding. If the input is malformed, 363 * replace by a default value. 364 */ 365 public static String decode(byte[] utf8) throws CharacterCodingException { 366 return decode(ByteBuffer.wrap(utf8), true); 367 } 368 369 public static String decode(byte[] utf8, int start, int length) 370 throws CharacterCodingException { 371 return decode(ByteBuffer.wrap(utf8, start, length), true); 372 } 373 374 /** 375 * Converts the provided byte array to a String using the 376 * UTF-8 encoding. If <code>replace</code> is true, then 377 * malformed input is replaced with the 378 * substitution character, which is U+FFFD. Otherwise the 379 * method throws a MalformedInputException. 380 */ 381 public static String decode(byte[] utf8, int start, int length, boolean replace) 382 throws CharacterCodingException { 383 return decode(ByteBuffer.wrap(utf8, start, length), replace); 384 } 385 386 private static String decode(ByteBuffer utf8, boolean replace) 387 throws CharacterCodingException { 388 CharsetDecoder decoder = DECODER_FACTORY.get(); 389 if (replace) { 390 decoder.onMalformedInput( 391 java.nio.charset.CodingErrorAction.REPLACE); 392 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 393 } 394 String str = decoder.decode(utf8).toString(); 395 // set decoder back to its default value: REPORT 396 if (replace) { 397 decoder.onMalformedInput(CodingErrorAction.REPORT); 398 decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 399 } 400 return str; 401 } 402 403 /** 404 * Converts the provided String to bytes using the 405 * UTF-8 encoding. If the input is malformed, 406 * invalid chars are replaced by a default value. 407 * @return ByteBuffer: bytes stores at ByteBuffer.array() 408 * and length is ByteBuffer.limit() 409 */ 410 411 public static ByteBuffer encode(String string) 412 throws CharacterCodingException { 413 return encode(string, true); 414 } 415 416 /** 417 * Converts the provided String to bytes using the 418 * UTF-8 encoding. If <code>replace</code> is true, then 419 * malformed input is replaced with the 420 * substitution character, which is U+FFFD. Otherwise the 421 * method throws a MalformedInputException. 422 * @return ByteBuffer: bytes stores at ByteBuffer.array() 423 * and length is ByteBuffer.limit() 424 */ 425 public static ByteBuffer encode(String string, boolean replace) 426 throws CharacterCodingException { 427 CharsetEncoder encoder = ENCODER_FACTORY.get(); 428 if (replace) { 429 encoder.onMalformedInput(CodingErrorAction.REPLACE); 430 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 431 } 432 ByteBuffer bytes = 433 encoder.encode(CharBuffer.wrap(string.toCharArray())); 434 if (replace) { 435 encoder.onMalformedInput(CodingErrorAction.REPORT); 436 encoder.onUnmappableCharacter(CodingErrorAction.REPORT); 437 } 438 return bytes; 439 } 440 441 static final public int DEFAULT_MAX_LEN = 1024 * 1024; 442 443 /** Read a UTF8 encoded string from in 444 */ 445 public static String readString(DataInput in) throws IOException { 446 int length = WritableUtils.readVInt(in); 447 byte [] bytes = new byte[length]; 448 in.readFully(bytes, 0, length); 449 return decode(bytes); 450 } 451 452 /** Read a UTF8 encoded string with a maximum size 453 */ 454 public static String readString(DataInput in, int maxLength) 455 throws IOException { 456 int length = WritableUtils.readVIntInRange(in, 0, maxLength); 457 byte [] bytes = new byte[length]; 458 in.readFully(bytes, 0, length); 459 return decode(bytes); 460 } 461 462 /** Write a UTF8 encoded string to out 463 */ 464 public static int writeString(DataOutput out, String s) throws IOException { 465 ByteBuffer bytes = encode(s); 466 int length = bytes.limit(); 467 WritableUtils.writeVInt(out, length); 468 out.write(bytes.array(), 0, length); 469 return length; 470 } 471 472 /** Write a UTF8 encoded string with a maximum size to out 473 */ 474 public static int writeString(DataOutput out, String s, int maxLength) 475 throws IOException { 476 ByteBuffer bytes = encode(s); 477 int length = bytes.limit(); 478 if (length > maxLength) { 479 throw new IOException("string was too long to write! Expected " + 480 "less than or equal to " + maxLength + " bytes, but got " + 481 length + " bytes."); 482 } 483 WritableUtils.writeVInt(out, length); 484 out.write(bytes.array(), 0, length); 485 return length; 486 } 487 488 ////// states for validateUTF8 489 490 private static final int LEAD_BYTE = 0; 491 492 private static final int TRAIL_BYTE_1 = 1; 493 494 private static final int TRAIL_BYTE = 2; 495 496 /** 497 * Check if a byte array contains valid utf-8 498 * @param utf8 byte array 499 * @throws MalformedInputException if the byte array contains invalid utf-8 500 */ 501 public static void validateUTF8(byte[] utf8) throws MalformedInputException { 502 validateUTF8(utf8, 0, utf8.length); 503 } 504 505 /** 506 * Check to see if a byte array is valid utf-8 507 * @param utf8 the array of bytes 508 * @param start the offset of the first byte in the array 509 * @param len the length of the byte sequence 510 * @throws MalformedInputException if the byte array contains invalid bytes 511 */ 512 public static void validateUTF8(byte[] utf8, int start, int len) 513 throws MalformedInputException { 514 int count = start; 515 int leadByte = 0; 516 int length = 0; 517 int state = LEAD_BYTE; 518 while (count < start+len) { 519 int aByte = ((int) utf8[count] & 0xFF); 520 521 switch (state) { 522 case LEAD_BYTE: 523 leadByte = aByte; 524 length = bytesFromUTF8[aByte]; 525 526 switch (length) { 527 case 0: // check for ASCII 528 if (leadByte > 0x7F) 529 throw new MalformedInputException(count); 530 break; 531 case 1: 532 if (leadByte < 0xC2 || leadByte > 0xDF) 533 throw new MalformedInputException(count); 534 state = TRAIL_BYTE_1; 535 break; 536 case 2: 537 if (leadByte < 0xE0 || leadByte > 0xEF) 538 throw new MalformedInputException(count); 539 state = TRAIL_BYTE_1; 540 break; 541 case 3: 542 if (leadByte < 0xF0 || leadByte > 0xF4) 543 throw new MalformedInputException(count); 544 state = TRAIL_BYTE_1; 545 break; 546 default: 547 // too long! Longest valid UTF-8 is 4 bytes (lead + three) 548 // or if < 0 we got a trail byte in the lead byte position 549 throw new MalformedInputException(count); 550 } // switch (length) 551 break; 552 553 case TRAIL_BYTE_1: 554 if (leadByte == 0xF0 && aByte < 0x90) 555 throw new MalformedInputException(count); 556 if (leadByte == 0xF4 && aByte > 0x8F) 557 throw new MalformedInputException(count); 558 if (leadByte == 0xE0 && aByte < 0xA0) 559 throw new MalformedInputException(count); 560 if (leadByte == 0xED && aByte > 0x9F) 561 throw new MalformedInputException(count); 562 // falls through to regular trail-byte test!! 563 case TRAIL_BYTE: 564 if (aByte < 0x80 || aByte > 0xBF) 565 throw new MalformedInputException(count); 566 if (--length == 0) { 567 state = LEAD_BYTE; 568 } else { 569 state = TRAIL_BYTE; 570 } 571 break; 572 } // switch (state) 573 count++; 574 } 575 } 576 577 /** 578 * Magic numbers for UTF-8. These are the number of bytes 579 * that <em>follow</em> a given lead byte. Trailing bytes 580 * have the value -1. The values 4 and 5 are presented in 581 * this table, even though valid UTF-8 cannot include the 582 * five and six byte sequences. 583 */ 584 static final int[] bytesFromUTF8 = 585 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 586 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 587 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 588 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 589 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 590 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 591 0, 0, 0, 0, 0, 0, 0, 592 // trail bytes 593 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 594 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 595 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 596 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 597 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 598 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 599 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; 600 601 /** 602 * Returns the next code point at the current position in 603 * the buffer. The buffer's position will be incremented. 604 * Any mark set on this buffer will be changed by this method! 605 */ 606 public static int bytesToCodePoint(ByteBuffer bytes) { 607 bytes.mark(); 608 byte b = bytes.get(); 609 bytes.reset(); 610 int extraBytesToRead = bytesFromUTF8[(b & 0xFF)]; 611 if (extraBytesToRead < 0) return -1; // trailing byte! 612 int ch = 0; 613 614 switch (extraBytesToRead) { 615 case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ 616 case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ 617 case 3: ch += (bytes.get() & 0xFF); ch <<= 6; 618 case 2: ch += (bytes.get() & 0xFF); ch <<= 6; 619 case 1: ch += (bytes.get() & 0xFF); ch <<= 6; 620 case 0: ch += (bytes.get() & 0xFF); 621 } 622 ch -= offsetsFromUTF8[extraBytesToRead]; 623 624 return ch; 625 } 626 627 628 static final int offsetsFromUTF8[] = 629 { 0x00000000, 0x00003080, 630 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; 631 632 /** 633 * For the given string, returns the number of UTF-8 bytes 634 * required to encode the string. 635 * @param string text to encode 636 * @return number of UTF-8 bytes required to encode 637 */ 638 public static int utf8Length(String string) { 639 CharacterIterator iter = new StringCharacterIterator(string); 640 char ch = iter.first(); 641 int size = 0; 642 while (ch != CharacterIterator.DONE) { 643 if ((ch >= 0xD800) && (ch < 0xDC00)) { 644 // surrogate pair? 645 char trail = iter.next(); 646 if ((trail > 0xDBFF) && (trail < 0xE000)) { 647 // valid pair 648 size += 4; 649 } else { 650 // invalid pair 651 size += 3; 652 iter.previous(); // rewind one 653 } 654 } else if (ch < 0x80) { 655 size++; 656 } else if (ch < 0x800) { 657 size += 2; 658 } else { 659 // ch < 0x10000, that is, the largest char value 660 size += 3; 661 } 662 ch = iter.next(); 663 } 664 return size; 665 } 666 }