001/* Copyright (C) 2013 TU Dortmund 002 * This file is part of AutomataLib, http://www.automatalib.net/. 003 * 004 * AutomataLib is free software; you can redistribute it and/or 005 * modify it under the terms of the GNU Lesser General Public 006 * License version 3.0 as published by the Free Software Foundation. 007 * 008 * AutomataLib is distributed in the hope that it will be useful, 009 * but WITHOUT ANY WARRANTY; without even the implied warranty of 010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 011 * Lesser General Public License for more details. 012 * 013 * You should have received a copy of the GNU Lesser General Public 014 * License along with AutomataLib; if not, see 015 * http://www.gnu.de/documents/lgpl.en.html. 016 */ 017package net.automatalib.words; 018 019import java.io.IOException; 020import java.util.AbstractList; 021import java.util.Collections; 022import java.util.List; 023import java.util.Objects; 024 025import net.automatalib.commons.util.array.ArrayWritable; 026import net.automatalib.commons.util.strings.AbstractPrintable; 027import net.automatalib.commons.util.strings.StringUtil; 028 029/** 030 * A word is an ordered sequence of symbols. {@link Word}s are generally immutable, 031 * i.e., a single {@link Word} object will never change (unless symbol objects are modified, 032 * which is however highly discouraged). 033 * <p> 034 * This class provides the following static methods for creating words in the most common scenarios: 035 * <ul> 036 * <li> {@link #epsilon()} returns the empty word of length 0 037 * <li> {@link #fromLetter(Object)} turns a single letter into a word of length 1 038 * <li> {@link #fromSymbols(Object...)} creates a word from an array of symbols 039 * <li> {@link #fromArray(Object[], int, int)} creates a word from a subrange of a symbols array 040 * <li> {@link #fromList(List)} creates a word from a {@link List} of symbols 041 * </ul> 042 * <p> 043 * Modification operations like {@link #append(Object)} or {@link #concat(Word...)} create 044 * new objects, subsequently invoking these operations on the respective objects returned is 045 * therefore highly inefficient. If words need to be dynamically created, a {@link WordBuilder} 046 * should be used. 047 * <p> 048 * This is an abstract base class for word representations. Implementing classes only need to implement 049 * <ul> 050 * <li> {@link #getSymbol(int)} 051 * <li> {@link #length()} 052 * </ul> 053 * <p> 054 * However, for the sake of efficiency it is highly encouraged to overwrite the other methods 055 * as well, providing specialized realizations. 056 * 057 * @param <I> symbol class 058 * 059 * @author Malte Isberner <malte.isberner@gmail.com> 060 */ 061public abstract class Word<I> extends AbstractPrintable implements ArrayWritable<I>, Iterable<I> { 062 063 064 /** 065 * Retrieves the empty word. 066 * @return the empty word. 067 * @see Collections#emptyList() 068 */ 069 @SuppressWarnings("unchecked") 070 public static <I> Word<I> epsilon() { 071 return (Word<I>)(Word<?>)EmptyWord.INSTANCE; 072 } 073 074 /** 075 * Constructs a word from a single letter. 076 * @param letter the letter 077 * @return a word consisting of only this letter 078 */ 079 public static <I> Word<I> fromLetter(I letter) { 080 return new LetterWord<>(letter); 081 } 082 083 /** 084 * Creates a word from an array of symbols. 085 * @param symbols the symbol array 086 * @return a word containing the symbols in the specified array 087 */ 088 @SafeVarargs 089 public static <I> Word<I> fromSymbols(I ...symbols) { 090 if(symbols.length == 0) 091 return epsilon(); 092 if(symbols.length == 1) 093 return fromLetter(symbols[0]); 094 Object[] array = new Object[symbols.length]; 095 System.arraycopy(symbols, 0, array, 0, symbols.length); 096 return new SharedWord<>(symbols); 097 } 098 099 /** 100 * Creates a word from a subrange of an array of symbols. Note that to 101 * ensure immutability, internally a copy of the array is made. 102 * @param symbols the symbols array 103 * @param offset the starting index in the array 104 * @param length the length of the resulting word (from the starting index on) 105 * @return the word consisting of the symbols in the range 106 */ 107 public static <I> Word<I> fromArray(I[] symbols, int offset, int length) { 108 if(length == 0) 109 return epsilon(); 110 if(length == 1) 111 return fromLetter(symbols[offset]); 112 Object[] array = new Object[length]; 113 System.arraycopy(symbols, offset, array, 0, length); 114 return new SharedWord<>(symbols); 115 } 116 117 /** 118 * Creates a word from a list of symbols 119 * @param symbolList the list of symbols 120 * @return the resulting word 121 */ 122 public static <I> Word<I> fromList(List<? extends I> symbolList) { 123 int siz = symbolList.size(); 124 if(siz == 0) 125 return epsilon(); 126 if(siz == 1) 127 return Word.<I>fromLetter(symbolList.get(0)); 128 return new SharedWord<>(symbolList); 129 } 130 131 public static Word<Character> fromString(String str) { 132 int len = str.length(); 133 Character[] chars = new Character[str.length()]; 134 for(int i = 0; i < len; i++) 135 chars[i] = Character.valueOf(str.charAt(i)); 136 return new SharedWord<>(chars); 137 } 138 139 /* 140 * General word iterator 141 */ 142 private class Iterator implements java.util.Iterator<I> { 143 144 private int index = 0; 145 146 /* 147 * (non-Javadoc) 148 * @see java.util.Iterator#hasNext() 149 */ 150 @Override 151 public boolean hasNext() { 152 return (index < Word.this.length()); 153 } 154 155 /* 156 * (non-Javadoc) 157 * @see java.util.Iterator#next() 158 */ 159 @Override 160 public I next() { 161 return Word.this.getSymbol(index++); 162 } 163 164 /* 165 * (non-Javadoc) 166 * @see java.util.Iterator#remove() 167 */ 168 @Override 169 public void remove() { 170 throw new UnsupportedOperationException(); 171 } 172 } 173 174 /* 175 * Representing a word as a list. 176 */ 177 private class AsList extends AbstractList<I> { 178 179 /* 180 * (non-Javadoc) 181 * @see java.util.AbstractList#get(int) 182 */ 183 @Override 184 public I get(int index) { 185 return Word.this.getSymbol(index); 186 } 187 188 /* 189 * (non-Javadoc) 190 * @see java.util.AbstractCollection#size() 191 */ 192 @Override 193 public int size() { 194 return Word.this.length(); 195 } 196 197 /* 198 * (non-Javadoc) 199 * @see java.util.AbstractList#iterator() 200 */ 201 @Override 202 public java.util.Iterator<I> iterator() { 203 return Word.this.iterator(); 204 } 205 } 206 207 /** 208 * Return symbol that is at the specified position 209 * @param i the position 210 * @return symbol at position i, <tt>null</tt> if no such symbol exists 211 */ 212 public abstract I getSymbol(int index); 213 214 /** 215 * Retrieves the length of this word. 216 * @return the length of this word. 217 */ 218 public abstract int length(); 219 220 /* 221 * (non-Javadoc) 222 * @see java.lang.Object#hashCode() 223 */ 224 @Override 225 public int hashCode() { 226 int hash = 5; 227 for(I sym : this) { 228 hash *= 89; 229 hash += (sym != null) ? sym.hashCode() : 0; 230 } 231 return hash; 232 } 233 234 /* 235 * (non-Javadoc) 236 * @see java.lang.Object#equals(java.lang.Object) 237 */ 238 @Override 239 public boolean equals(Object other) { 240 if(this == other) 241 return true; 242 if(other == null) 243 return false; 244 if(!(other instanceof Word)) 245 return false; 246 Word<?> otherWord = (Word<?>)other; 247 int len = otherWord.length(); 248 if(len != length()) 249 return false; 250 java.util.Iterator<I> thisIt = iterator(); 251 java.util.Iterator<?> otherIt = otherWord.iterator(); 252 while(thisIt.hasNext()) { 253 I thisSym = thisIt.next(); 254 Object otherSym = otherIt.next(); 255 if(!Objects.equals(thisSym, otherSym)) 256 return false; 257 } 258 return true; 259 } 260 261 /* 262 * (non-Javadoc) 263 * @see net.automatalib.commons.util.strings.Printable#print(java.lang.Appendable) 264 */ 265 @Override 266 public void print(Appendable a) throws IOException { 267 StringUtil.appendIterable(a, this, " "); 268 } 269 270 /* 271 * (non-Javadoc) 272 * @see net.automatalib.commons.util.array.ArrayWritable#size() 273 */ 274 @Override 275 public final int size() { 276 return length(); 277 } 278 279 280 281 /** 282 * Retrieves a word representing the specified subrange of this word. 283 * As words are immutable, this function usually can be realized quite efficient 284 * (implementing classes should take care of this). 285 * 286 * @param fromIndex the first index, inclusive. 287 * @param toIndex the last index, exclusive. 288 * @return the word representing the specified subrange. 289 */ 290 public final Word<I> subWord(int fromIndex, int toIndex) { 291 if(fromIndex < 0 || toIndex < fromIndex || toIndex > length()) 292 throw new IndexOutOfBoundsException("Invalid subword range [" + fromIndex + ", " + toIndex + ")"); 293 294 return _subWord(fromIndex, toIndex); 295 } 296 297 /** 298 * Retrieves the subword of this word starting at the given index and extending 299 * until the end of this word. Calling this method is equivalent to calling 300 * <pre>w.subWord(fromIndex, w.length())</pre> 301 * @param fromIndex the first index, inclusive 302 * @return the word representing the specified subrange 303 */ 304 public final Word<I> subWord(int fromIndex) { 305 if(fromIndex <= 0) { 306 if(fromIndex == 0) 307 return this; 308 throw new IndexOutOfBoundsException("Invalid subword range [" + fromIndex + ",)"); 309 } 310 return _subWord(fromIndex, length()); 311 } 312 313 /** 314 * Internal subword operation implementation. In contrast to {@link #subWord(int, int)}, 315 * no range checks need to be performed. As this method is flagged as <tt>protected</tt>, 316 * implementations may rely on the specified indices being valid. 317 * @param fromIndex the first index, inclusive (guaranteed to be valid) 318 * @param toIndex the last index, exclusive (guaranteed to be valid) 319 * @return the word representing the specified subrange 320 */ 321 protected Word<I> _subWord(int fromIndex, int toIndex) { 322 int len = toIndex - fromIndex; 323 Object[] array = new Object[len]; 324 writeToArray(fromIndex, array, 0, len); 325 return new SharedWord<>(array); 326 } 327 328 /* 329 * (non-Javadoc) 330 * @see java.lang.Iterable#iterator() 331 */ 332 @Override 333 public java.util.Iterator<I> iterator() { 334 return new Iterator(); 335 } 336 337 /* 338 * (non-Javadoc) 339 * @see net.automatalib.commons.util.array.ArrayWritable#writeToArray(int, java.lang.Object[], int, int) 340 */ 341 @Override 342 public void writeToArray(int offset, Object[] array, int tgtOffset, int length) { 343 int idx = offset, arrayIdx = tgtOffset; 344 345 while(length-- > 0) 346 array[arrayIdx++] = getSymbol(idx++); 347 } 348 349 350 /** 351 * Retrieves a {@link List} view on the contents of this word. 352 * @return an unmodifiable list of the contained symbols. 353 */ 354 public List<I> asList() { 355 return new AsList(); 356 } 357 358 /** 359 * Retrieves a prefix of the given length. If <code>length</code> 360 * is negative, then a prefix consisting of all but the last 361 * <code>-length</code> symbols is returned. 362 * 363 * @param prefixLen the length of the prefix (may be negative, see above). 364 * @return the prefix of the given length. 365 */ 366 public final Word<I> prefix(int prefixLen) { 367 if(prefixLen < 0) 368 prefixLen = length() + prefixLen; 369 370 return subWord(0, prefixLen); 371 } 372 373 /** 374 * Retrieves a suffix of the given length. If <code>length</code> is 375 * negative, then a suffix consisting of all but the first 376 * <code>-length</code> symbols is returned. 377 * 378 * @param suffixLen the length of the suffix (may be negative, see above). 379 * @return the suffix of the given length. 380 */ 381 public final Word<I> suffix(int suffixLen) { 382 int wordLen = length(); 383 int startIdx = (suffixLen < 0) ? -suffixLen : (wordLen - suffixLen); 384 385 return subWord(startIdx, wordLen); 386 } 387 388 /** 389 * Retrieves the list of all prefixes of this word. In the default implementation, 390 * the prefixes are lazily instantiated upon the respective calls of {@link List#get(int)} 391 * or {@link Iterator#next()}. 392 * @param longestFirst whether to start with the longest prefix (otherwise, the first prefix 393 * in the list will be the shortest). 394 * @return a (non-materialized) list containing all prefixes 395 */ 396 public List<Word<I>> prefixes(boolean longestFirst) { 397 return new SubwordList<>(this, true, longestFirst); 398 } 399 400 /** 401 * Retrieves the list of all suffixes of this word. In the default implementation, 402 * the suffixes are lazily instantiated upon the respective calls of {@link List#get(int)} 403 * or {@link Iterator#next()}. 404 * @param longestFirst whether to start with the longest suffix (otherwise, the first suffix 405 * in the list will be the shortest). 406 * @return a (non-materialized) list containing all suffix 407 */ 408 public List<Word<I>> suffixes(boolean longestFirst) { 409 return new SubwordList<>(this, false, longestFirst); 410 } 411 412 413 /** 414 * Retrieves the next word after this in canonical order. Figuratively 415 * speaking, if there are <tt>k</tt> alphabet symbols, one can think of 416 * a word of length <tt>n</tt> as an <tt>n</tt>-digit radix-<tt>k</tt> 417 * representation of the number. The next word in canonical order 418 * is the representation for the number represented by this word plus one. 419 * @param sigma the alphabet 420 * @return the next word in canonical order 421 */ 422 public Word<I> canonicalNext(Alphabet<I> sigma) { 423 int len = length(); 424 Object[] symbols = new Object[len]; 425 writeToArray(0, symbols, 0, len); 426 427 int alphabetSize = sigma.size(); 428 429 int i = 0; 430 boolean overflow = true; 431 for(I sym : this) { 432 int nextIdx = (sigma.getSymbolIndex(sym) + 1) % alphabetSize; 433 symbols[i++] = sigma.getSymbol(nextIdx); 434 if(nextIdx != 0) { 435 overflow = false; 436 break; 437 } 438 } 439 440 while(i < len) { 441 symbols[i] = getSymbol(i); 442 i++; 443 } 444 445 if(overflow) { 446 Object[] newSymbols = new Object[len+1]; 447 newSymbols[0] = sigma.getSymbol(0); 448 System.arraycopy(symbols, 0, newSymbols, 1, len); 449 symbols = newSymbols; 450 } 451 452 return new SharedWord<>(symbols); 453 } 454 455 /** 456 * Retrieves the last symbol of this word. 457 * @return the last symbol of this word. 458 */ 459 public I lastSymbol() { 460 return getSymbol(length() - 1); 461 } 462 463 /** 464 * Retrieves the first symbol of this word. 465 * @return the first symbol of this word 466 */ 467 public I firstSymbol() { 468 return getSymbol(0); 469 } 470 471 /** 472 * Appends a symbol to this word and returns the result as a new word. 473 * @param symbol the symbol to append 474 * @return the word plus the given symbol 475 */ 476 public Word<I> append(I symbol) { 477 int len = length(); 478 Object[] array = new Object[len + 1]; 479 writeToArray(0, array, 0, len); 480 array[len] = symbol; 481 return new SharedWord<>(array); 482 } 483 484 /** 485 * Prepends a symbol to this word and returns the result as a new word. 486 * @param symbol the symbol to prepend 487 * @return the given symbol plus to word. 488 */ 489 public Word<I> prepend(I symbol) { 490 int len = length(); 491 Object[] array = new Object[len+1]; 492 array[0] = symbol; 493 writeToArray(0, array, 1, len); 494 495 return new SharedWord<>(array); 496 } 497 498 /** 499 * Concatenates this word with several other words and returns the result 500 * as a new word. 501 * 502 * Note that this method cannot be overridden. Implementing classes need to 503 * override the {@link #_concat(Word...)} method instead. 504 * 505 * @param words the words to concatenate with this word 506 * @return the result of the concatenation 507 * @see #_concat(Word...) 508 */ 509 @SafeVarargs 510 public final Word<I> concat(Word<I> ...words) { 511 return _concat(words); 512 } 513 514 /** 515 * Realizes the concatenation of this word with several other words. 516 * @param words the words to concatenate 517 * @return the results of the concatenation 518 */ 519 @SuppressWarnings("unchecked") 520 protected Word<I> _concat(Word<I> ...words) { 521 if(words.length == 0) 522 return this; 523 524 int len = length(); 525 526 int totalSize = len; 527 for(int i = 0; i < words.length; i++) 528 totalSize += words[i].length(); 529 530 Object[] array = new Object[totalSize]; 531 writeToArray(0, array, 0, len); 532 int currOfs = len; 533 for(int i = 0; i < words.length; i++) { 534 Word<I> w = words[i]; 535 int wLen = w.length(); 536 w.writeToArray(0, array, currOfs, wLen); 537 currOfs += wLen; 538 } 539 540 return new SharedWord<>(array); 541 } 542 543 /** 544 * Checks if this word is a prefix of another word. 545 * @param other the other word 546 * @return <tt>true</tt> if this word is a prefix of the other word, <tt>false</tt> 547 * otherwise. 548 */ 549 public boolean isPrefixOf(Word<I> other) { 550 int len = length(), otherLen = other.length(); 551 if(otherLen < len) 552 return false; 553 554 for(int i = 0; i < len; i++) { 555 I sym1 = getSymbol(i), sym2 = other.getSymbol(i); 556 557 if(!Objects.equals(sym1, sym2)) 558 return false; 559 } 560 return true; 561 } 562 563 /** 564 * Determines the longest common prefix of this word and another 565 * word. 566 * @param other the other word 567 * @return the longest common prefix of this word and the other word 568 */ 569 public Word<I> longestCommonPrefix(Word<I> other) { 570 int len = length(), otherLen = other.length(); 571 int maxIdx = (len < otherLen) ? len : otherLen; 572 573 int i = 0; 574 while(i < maxIdx) { 575 I sym1 = getSymbol(i), sym2 = getSymbol(i); 576 577 if(!Objects.equals(sym1, sym2)) 578 break; 579 i++; 580 } 581 582 return prefix(i); 583 } 584 585 /** 586 * Checks if this word is a suffix of another word 587 * @param other the other word 588 * @return <tt>true</tt> if this word is a suffix of the other word, <tt>false</tt> 589 * otherwise. 590 */ 591 public boolean isSuffixOf(Word<I> other) { 592 int len = length(), otherLen = other.length(); 593 if(otherLen < len) 594 return false; 595 596 int ofs = otherLen - len; 597 for(int i = 0; i < len; i++) { 598 I sym1 = getSymbol(i), sym2 = other.getSymbol(ofs + i); 599 if(!Objects.equals(sym1, sym2)) 600 return false; 601 } 602 return true; 603 } 604 605 /** 606 * Determines the longest common suffix of this word and another word. 607 * @param other the other word 608 * @return the longest common suffix 609 */ 610 public Word<I> longestCommonSuffix(Word<I> other) { 611 int len = length(), otherLen = other.length(); 612 int minLen = (len < otherLen) ? len : otherLen; 613 614 int idx1 = len, idx2 = otherLen; 615 int i = 0; 616 while(i < minLen) { 617 I sym1 = getSymbol(--idx1), sym2 = other.getSymbol(--idx2); 618 619 if(!Objects.equals(sym1, sym2)) 620 break; 621 622 i++; 623 } 624 625 return suffix(i); 626 } 627 628 /** 629 * Retrieves a "flattened" version of this word, i.e., without any hierarchical structure 630 * attached. 631 * This can be helpful if {@link Word} is subclassed to allow representing, e.g., a concatenation 632 * dynamically, but due to performance concerns not too many levels of indirection 633 * should be introduced. 634 * @return a flattened version of this word. 635 */ 636 public Word<I> flatten() { 637 int len = length(); 638 Object[] array = new Object[len]; 639 writeToArray(0, array, 0, len); 640 return new SharedWord<>(array); 641 } 642 643 public Word<I> trimmed() { 644 int len = length(); 645 Object[] array = new Object[len]; 646 writeToArray(0, array, 0, len); 647 return new SharedWord<>(array); 648 } 649 650 /** 651 * Checks if this word is empty, i.e., contains no symbols. 652 * @return <tt>true</tt> if this word is empty, <tt>false</tt> otherwise. 653 */ 654 public boolean isEmpty() { 655 return (length() == 0); 656 } 657}