001 /* 002 * CRIMSON 003 * Copyright (c) 2006, Stephen Fisher, Susan Davidson, and Junhyong Kim, 004 * University of Pennsylvania. 005 * 006 * This program is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU General Public License as 008 * published by the Free Software Foundation; either version 2 of the 009 * License, or (at your option) any later version. 010 * 011 * This program is distributed in the hope that it will be useful, but 012 * WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License 017 * along with this program; if not, write to the Free Software 018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 019 * 02110-1301 USA. 020 * 021 * @(#)Tree.java 022 */ 023 024 package edu.upenn.crimson; 025 026 import edu.upenn.crimson.io.*; 027 import java.util.HashSet; 028 import java.util.HashMap; 029 import java.util.ArrayList; 030 import java.util.TreeSet; 031 import java.util.Queue; 032 import java.util.LinkedList; 033 import java.util.Iterator; 034 import java.sql.SQLException; 035 036 /** 037 * A rooted tree that is composed of Species objects. 038 * 039 * Species must be added to the tree using addSpecies() and similarly 040 * removed from the tree with removeSpecies(). 041 * 042 * @XXX Should encapulate Species.add/removeChildren() in order to 043 * maintain a HshSet of all leaves/species. This would optimize 044 * numSpecies/numLeaves and would allow for the temporary storage of 045 * min/max values (ie level and temporal depth). These values could be 046 * stored when first computed and zeroed out when a species is 047 * added/removed from the tree. Similarly, min/max Stem Lengths 048 * could be pre-computed. Would need to also encapsulate setting of 049 * stem length and parent. 050 * 051 * @author Stephen Fisher 052 * @version $Id: Tree.java,v 1.61 2009/07/21 01:19:12 fisher Exp $ 053 */ 054 055 public class Tree implements Cloneable { 056 /** 057 * If true, then whenever a tree is sampled, the list of species 058 * selected will be output to the file 'species.list'. 059 */ 060 final static boolean DEBUG_OUTPUT = false; 061 062 /** 063 * Used in isUltrametric() to deal with machine error. If 064 * (maxTempDepth() - minTempDepth() < MaxMinDIFF), consider it to 065 * be an ultrametric tree. 066 */ 067 // final static double MaxMinDIFF = 0.00001; 068 069 /** 070 * This is a unique name for the Tree object. We don't allow the 071 * ID to change after a tree is created. 072 */ 073 private String id = ""; 074 075 /** Model specific to this Tree object. */ 076 private String modelID = ""; 077 078 /** Notes specific to this Tree object. */ 079 private String notes = ""; 080 081 /** The root Species of the tree. */ 082 private Species root = null; 083 084 /** Array containing references to all species in the tree. */ 085 private ArrayList speciesList = new ArrayList(); 086 087 /** 088 * The number of species is equivalent to speciesList.size(). It 089 * is stored separately, in case the tree has not been built from 090 * the newick string. 091 */ 092 private int numSpecies = -1; 093 094 /** 095 * The number of leaves is stored, in case the tree has not been 096 * built from the newick string. 097 */ 098 private int numLeaves = -1; 099 100 /** True when all inner nodes have exactly 2 children. */ 101 private boolean binary = false; 102 103 /** 104 * True when all leaves have the same temporal depth: 105 * (minTempDepth = maxTempDepth) 106 */ 107 private boolean ultrametric = false; 108 109 /** 110 * The minimum number of species between the leaves and root. 111 * This is inclusive of the root. Thus a leaf that is a child of 112 * the root would have a level of 1. 113 */ 114 private int minLevel = -1; 115 116 /** 117 * The maximum number of species between the leaves and root. 118 * This is inclusive of the root. Thus a leaf that is a child of 119 * the root would have a level of 1. 120 */ 121 private int maxLevel = -1; 122 123 /** The minimum distance between the species and their parents. */ 124 private double minStemLength = -1.0; 125 126 /** The maximum distance between the species and their parents. */ 127 private double maxStemLength = -1.0; 128 129 /** The minimum distance between the leaves and root. */ 130 private double minTempDepth = -1.0; 131 132 /** The maximum distance between the leaves and root. */ 133 private double maxTempDepth = -1.0; 134 135 /** This contains the partitions loaded for this tree. */ 136 private HashSet partitions = new HashSet(); 137 138 public Tree(String id) throws InvalidIDException { 139 this(id, true); 140 } 141 142 public Tree(String id, boolean addToPool) throws InvalidIDException { 143 // if no ID, then error 144 if (CrimsonUtils.isEmpty(id)) { 145 CrimsonUtils.printError("Invalid tree ID."); 146 throw new InvalidIDException("Tree ID can not be empty."); 147 } else if (ObjectHandles.containsTree(id)) { 148 CrimsonUtils.printError("Tree ID already exists, much specify a different ID:" + id); 149 throw new InvalidIDException("Tree ID already exists."); 150 } 151 this.id = id.trim().toUpperCase(); 152 153 // add the tree to the relevant lists 154 if (addToPool) ObjectHandles.addTree(this); 155 } 156 157 /** 158 * Trees should not be instantiated by the user, using this 159 * routine. It is used to load trees from the database. 160 */ 161 public Tree(String id, Species root, Model model, String notes, int numSpecies, int numLeaves, 162 boolean binary, boolean ultrametric, int minLevel, int maxLevel, 163 double minTempDepth, double maxTempDepth, double minStemLength, double maxStemLength) 164 throws InvalidIDException { 165 // if no ID, then error 166 if (CrimsonUtils.isEmpty(id)) { 167 CrimsonUtils.printError("Invalid tree ID."); 168 throw new InvalidIDException("Tree ID can not be empty."); 169 } else if (ObjectHandles.containsTree(id)) { 170 CrimsonUtils.printError("Tree ID already exists, much specify a different ID:" + id); 171 throw new InvalidIDException("Tree ID already exists."); 172 } 173 174 this.id = id.trim().toUpperCase(); 175 this.root = root; 176 this.notes = notes; 177 178 // the user should not be setting these values. These are set 179 // by computeStats(). This routine exists so that Trees can 180 // set this when the tree is loaded from the database. 181 this.numSpecies = numSpecies; 182 this.numLeaves = numLeaves; 183 this.binary = binary; 184 this.ultrametric = ultrametric; 185 this.minLevel = minLevel; 186 this.maxLevel = maxLevel; 187 this.minTempDepth = minTempDepth; 188 this.maxTempDepth = maxTempDepth; 189 this.minStemLength = minStemLength; 190 this.maxStemLength = maxStemLength; 191 192 // we require a model object rather than an ID, to make sure 193 // that the model really exists. 194 if (model != null) this.modelID = model.getID(); 195 196 // add the tree to the relevant lists 197 ObjectHandles.addTree(this); 198 } 199 200 //-------------------------------------------------------------------------- 201 // Setters and Getters 202 203 /** Get the ID. */ 204 public String getID() { return id; } 205 206 /** Returns the ID for the model underlying this tree. */ 207 public String getModelID() { return modelID; } 208 209 /** 210 * Sets the ID for the model underlying this tree. We require a 211 * model object rather than an ID, to make sure that the model 212 * really exists. 213 */ 214 public void setModelID(Model model) { 215 if (model == null) return; 216 217 this.modelID = model.getID(); 218 219 // update the TREES database 220 if (! Trees.dbContains(id)) { 221 CrimsonUtils.printError("Tree (" + id + ") not in database, tree model ID not saved to database."); 222 return; 223 } 224 225 // UPDATE trees SET model_id = modelID WHERE id = id 226 String sql = "UPDATE trees SET model_id = '" + modelID.toUpperCase() + "' WHERE id = '" + id.toUpperCase() + "'"; 227 if (! Database.execUpdate(sql)) 228 CrimsonUtils.printError("Error updating model_id in tree " + id + "."); 229 } 230 231 public String getNotes() { return this.notes; } 232 233 /** Updates the local notes field and the TREES table. */ 234 public void setNotes(String notes) { 235 this.notes = notes; 236 237 if (! Trees.dbContains(id)) { 238 CrimsonUtils.printError("Tree (" + id + ") not in database, notes not saved to database."); 239 return; 240 } 241 242 // UPDATE trees SET notes = notes WHERE id = id 243 String sql = "UPDATE trees SET notes = '" + notes + "' WHERE id = '" + id.toUpperCase() + "'"; 244 if (! Database.execUpdate(sql)) 245 CrimsonUtils.printError("Error updating notes in " + id + "."); 246 } 247 248 public Species getRoot() { return root; } 249 250 /** 251 * The root's label will be set to "_Root" if there is currently 252 * no label. The root will also be added to the species list. 253 */ 254 public void setRoot(Species root) { 255 if ((root != null) && CrimsonUtils.isEmpty(root.getID())) root.setID("_Root"); 256 this.root = root; 257 } 258 259 /** Returns the list of all species in the tree. */ 260 public ArrayList getSpeciesList() { return speciesList; } 261 262 /** This will return the set of partitions. */ 263 public HashSet getPartitions() { return partitions; } 264 265 /** This will return the number of species in the tree. */ 266 public int getNumSpecies() { return numSpecies; } 267 268 /** This will return the number of leaves in the tree. */ 269 public int getNumLeaves() { return numLeaves; } 270 271 public int getMinLevel() { return minLevel; } 272 273 public boolean isBinary() { return binary; } 274 275 public boolean isUltrametric() { return ultrametric; } 276 277 public int getMaxLevel() { return maxLevel; } 278 279 public double getMinStemLength() { return minStemLength; } 280 281 public double getMaxStemLength() { return maxStemLength; } 282 283 public double getMinTempDepth() { return minTempDepth; } 284 285 public double getMaxTempDepth() { return maxTempDepth; } 286 287 //-------------------------------------------------------------------------- 288 // Miscellaneous Methods 289 290 /** 291 * Returns the newick string for this tree, as stored in the 292 * database. This is retrieved from the database as needed to 293 * conserve RAM, instead of storing it as a string in the Tree 294 * object. 295 */ 296 public String getNewick() { 297 String sql = "SELECT newick FROM TREES WHERE id = '" + id.toUpperCase() + "'"; 298 ArrayList out = Database.sqlToArray(sql); 299 300 // if the query failed then exit 301 if (CrimsonUtils.isEmpty(out)) { 302 // tree not in database, so try and compute newick 303 String newick = toNewick(); 304 if (CrimsonUtils.isEmpty(newick)) { 305 CrimsonUtils.printError("Tree (" + id + ") not in database and unable to compute newick string."); 306 return ""; 307 } else { 308 return newick; 309 } 310 } 311 312 try { 313 String newick = Database.readClob(out.get(0)); 314 if (CrimsonUtils.isEmpty(newick)) { 315 // no newick string in the database, so try to compute 316 // it from the tree 317 newick = toNewick(); 318 319 if (CrimsonUtils.isEmpty(newick)) { 320 CrimsonUtils.printError("No newick string stored in database and unable to compute newick string."); 321 return ""; 322 } 323 } 324 return newick; 325 } catch (SQLException e) { 326 CrimsonUtils.printError("SQL error reading Newick CLOB from TREES."); 327 CrimsonUtils.printError(e.getMessage()); 328 return ""; 329 } 330 } 331 332 /** This will update the newick string stored in the database. */ 333 public void setNewick(String newick) { 334 if (! Trees.dbContains(id)) { 335 CrimsonUtils.printError("Tree (" + id + ") not in database, newick not updated."); 336 return; 337 } 338 339 // UPDATE trees SET newick = newick WHERE id = id 340 String sql = "UPDATE trees SET newic = '" + newick + "' WHERE id = '" + id.toUpperCase() + "'"; 341 if (! Database.execUpdate(sql)) 342 CrimsonUtils.printError("Error updating newick in " + id + "."); 343 } 344 345 /** Returns true if the tree has been built from the newick string. */ 346 public boolean isBuilt() { 347 if (root != null) return true; 348 else return false; 349 } 350 351 /** This will build the tree from the nexick string. */ 352 public boolean buildTree() { 353 if (isBuilt()) return true; 354 355 CrimsonUtils.printMsg("Building tree."); 356 NexusFile.parseNewick(this); 357 358 CrimsonUtils.printMsg("Parsed Newick."); 359 360 // if root not set, then can't build tree 361 if (root == null) return false; 362 363 // set the temporal depth and level for the tree. 364 Species species; 365 Queue queue = new LinkedList(); 366 367 // start with the root 368 queue.offer(root); 369 370 CrimsonUtils.printMsg("Computing tree stats."); 371 372 while ((species = (Species) queue.poll()) != null) { 373 species.computeLevel(); 374 species.computeTempDepth(); 375 376 // add children to the queue 377 queue = addChildrenToQueue(queue, species.getChildren()); 378 } 379 380 return true; 381 } 382 383 /** This will add the children to the queue. */ 384 private Queue addChildrenToQueue(Queue queue, Species[] children) { 385 if (children != null) { 386 for (int i = 0, n = children.length; i < n; i++) { 387 queue.offer(children[i]); 388 } 389 } 390 return queue; 391 } 392 393 /** 394 * This will release the tree structure from memory. This can be 395 * used to free up memory after large trees are built but the 396 * built structure is no longer needed. 397 */ 398 public void clearStructure() { 399 // remove links to the Species objects created 400 root = null; 401 speciesList.clear(); 402 403 // run the garbage collector to remove the Species objects 404 CrimsonMain.runGC(); 405 } 406 407 public void addSpecies(Species species) { 408 speciesList.add(species); 409 } 410 411 public void removeSpecies(Species species) { 412 speciesList.remove(species); 413 } 414 415 /** 416 * This will take a list of species IDs and return a list of 417 * species objects. 418 */ 419 public ArrayList getSpeciesByID(ArrayList speciesIn) { 420 ArrayList speciesOut = new ArrayList(); 421 if (! CrimsonUtils.isEmpty(speciesIn)) { 422 for (int i = 0; i < speciesIn.size(); i++) { 423 Species species = getSpeciesByID((String) speciesIn.get(i)); 424 if (species == null) { 425 CrimsonUtils.printWarning("Ignoring invalid species ID: " + speciesIn.get(i)); 426 } else { 427 speciesOut.add(species); 428 } 429 } 430 } 431 432 return speciesOut; 433 } 434 435 /** 436 * This steps through every species in the tree and compares the 437 * species IDs until it finds the species requested. 438 */ 439 public Species getSpeciesByID(String id) { 440 if (! CrimsonUtils.isEmpty(id)) { 441 for (int i = 0, n = speciesList.size(); i < n; i++) { 442 Species species = (Species) speciesList.get(i); 443 if (species.getID().equals(id)) return species; 444 } 445 } 446 447 return null; 448 } 449 450 /** This will compute the various tree statistics. */ 451 public void computeStats() { 452 // if root has a value then we've parsed the newick string 453 if (! buildTree()) { 454 CrimsonUtils.printError("Can't computer stats because no valid tree."); 455 return; 456 } 457 458 numSpecies = speciesList.size(); 459 numLeaves = 0; 460 461 minLevel = Integer.MAX_VALUE; 462 maxLevel = 0; 463 minStemLength = Double.MAX_VALUE; 464 maxStemLength = 0.0; 465 minTempDepth = Double.MAX_VALUE; 466 maxTempDepth = 0.0; 467 double minNonZeroSL = Double.MAX_VALUE; 468 for (int i = 0, n = numSpecies; i < n; i++) { 469 Species species = (Species) speciesList.get(i); 470 if (species.isLeaf()) { 471 numLeaves++; 472 if (species.getLevel() < minLevel) minLevel = species.getLevel(); 473 if (species.getLevel() > maxLevel) maxLevel = species.getLevel(); 474 if (species.getStemLength() < minStemLength) minStemLength = species.getStemLength(); 475 if (species.getStemLength() > maxStemLength) maxStemLength = species.getStemLength(); 476 if (species.getTempDepth() < minTempDepth) minTempDepth = species.getTempDepth(); 477 if (species.getTempDepth() > maxTempDepth) maxTempDepth = species.getTempDepth(); 478 if (species.getStemLength() < minNonZeroSL) { 479 if (species.getStemLength() > 0.0) minNonZeroSL = species.getStemLength(); 480 } 481 } 482 } 483 484 // compute binary 485 binary = computeBinary(); 486 487 // to compute ultrametric, we use the minimum non-zero stem 488 //length multiplied by 0.1 as the determination of the number 489 //of decimal places for comparison of the doubles. 490 ultrametric = (maxTempDepth - minTempDepth < (minNonZeroSL * 0.1)); 491 // ultrametric = (maxTempDepth - minTempDepth < MaxMinDIFF); 492 } 493 494 /** Returns true if the tree is binary. */ 495 public boolean computeBinary() { 496 if (! buildTree()) return false; 497 498 Species species; 499 Queue queue = new LinkedList(); 500 501 // start with the root 502 queue.offer(root); 503 504 while ((species = (Species) queue.poll()) != null) { 505 if (species.isBinary()) { 506 // we are binary, so add children to the queue 507 queue = addChildrenToQueue(queue, species.getChildren()); 508 } else { 509 // we have either 1 or more than 2 children, so not binary 510 return false; 511 } 512 } 513 514 return true; 515 } 516 517 /** Returns the set of all leaves in the tree. */ 518 public ArrayList getLeaves() { 519 if (! buildTree()) return null; 520 521 ArrayList leaves = new ArrayList(); 522 for (int i = 0, n = speciesList.size(); i < n; i++) { 523 Species species = (Species) speciesList.get(i); 524 if (species.isLeaf()) leaves.add(species); 525 } 526 527 return leaves; 528 } 529 530 /** This will duplicate the tree and all species. */ 531 public Object clone() { 532 Tree newTree = new Tree(id + "_" + Long.toString(Math.abs(CrimsonUtils.random.nextLong())), false); 533 534 Species species; 535 Species newSpecies; 536 Queue queue = new LinkedList(); 537 538 // start with the root 539 queue.offer(root); 540 541 while ((species = (Species) queue.poll()) != null) { 542 newSpecies = species.copy(newTree); 543 if (newTree.getRoot() == null) newTree.setRoot(newSpecies); 544 545 // add children to the queue 546 Species[] children = species.getChildren(); 547 if (children != null) { 548 for (int i = 0; i < children.length; i++) { 549 newSpecies.addChild(children[i]); 550 queue.offer(children[i]); 551 } 552 } 553 } 554 555 newTree.computeStats(); 556 return newTree; 557 } 558 559 /** 560 * This will print out this species and all children as a newick 561 * formatted tree. This is meant to be used for trees which are 562 * created by the application. When trees are loaded into the 563 * system from a NEXUS file, their newick structure is stored in 564 * the database and the method "newick()" can be used to reall the 565 * structure. Trees that are created by a query do not have 566 * newick structures and thus this routine can be used to create 567 * that structure. The inner sequence IDs will be included and the 568 * tree will be rooted. 569 * @return returns newick string 570 */ 571 public String toNewick() { return toNewick(true, true); } 572 573 /** 574 * This will print out this species and all children as a newick 575 * formatted tree. The tree will be rooted. This is meant to be 576 * used for trees which are created by the application. When 577 * trees are loaded into the system from a NEXUS file, their 578 * newick structure is stored in the database and the method 579 * "newick()" can be used to recall the structure. Trees that are 580 * created by a query do not have newick structures and thus this 581 * routine can be used to create that structure. 582 * @param incSequence when true, inner sequence IDs will be 583 * included in the newick output 584 * @return returns newick string 585 */ 586 public String toNewick(boolean incSequence) { return toNewick(incSequence, true); } 587 588 /** 589 * This will output this species and all children as a newick 590 * formatted tree. This is meant to be used for trees which are 591 * created by the application. When trees are loaded into the 592 * system from a NEXUS file, their newick structure is stored in 593 * the database and the method "newick()" can be used to recall 594 * the structure. Trees that are created by a query do not have 595 * newick structures and thus this routine can be used to create 596 * that structure. 597 * @param incSequence when true, inner sequence IDs will be 598 * included in the newick output 599 * @param rooted when true, the tree will rooted 600 * @return returns newick string 601 */ 602 public String toNewick(boolean incSequence, boolean rooted) { 603 // if root has a value then we've parsed the newick string 604 if (! buildTree()) { 605 CrimsonUtils.printError("Can't print newick because no valid tree."); 606 return ""; 607 } 608 609 // reset all children counters 610 for (int i = 0; i < numSpecies; i++) { 611 ((Species) speciesList.get(i)).resetChildIndex(); 612 } 613 614 Species species = root; 615 616 // use a StringBuffer sized to approximately fit the entire 617 // string to facilitate appending to the string. 618 StringBuffer out = new StringBuffer(numSpecies*10); 619 620 int fdbkVal = numSpecies / 10; 621 if (fdbkVal == 0) fdbkVal = 1; 622 int fdbkInc = 1; 623 CrimsonUtils.printMsg("Processing newick: .", false); 624 while (species != null) { 625 if ((fdbkInc++ % fdbkVal) == 0) CrimsonUtils.printMsg(".", false); 626 627 if (species.hasNextChild()) { 628 // process next child 629 if (species.firstChild()) out.append("("); 630 else out.append(","); 631 species = species.nextChild(); 632 633 } else { 634 // no more children so add species and then move up 635 // the tree 636 if (species.isLeaf()) out.append(species.getID() + ":" + species.getStemLength()); 637 else { 638 if (incSequence) out.append(")" + species.getID() + ":" + species.getStemLength()); 639 // else out.append("):" + species.getStemLength()); 640 else { 641 if (rooted) { 642 out.append("):" + species.getStemLength()); 643 } else { 644 // not rooted so only include branch 645 // length if species has a parent 646 if (species.getParent() != null) out.append("):" + species.getStemLength()); 647 else out.append(")"); 648 } 649 } 650 } 651 species = species.getParent(); 652 } 653 } 654 CrimsonUtils.printMsg(""); 655 return out.toString(); 656 } 657 658 /** 659 * This will print out the tree, one species per line, indenting 660 * each level. This is meant to be used as a debugging tool and 661 * is not meant to be used to print out large trees. 662 * @XXX This will give a stack overflow for deep trees. 663 */ 664 public String printTree() { 665 // if root has a value then we've parsed the newick string 666 if (! buildTree()) { 667 CrimsonUtils.printError("Can't print tree because no valid tree."); 668 return ""; 669 } 670 671 return root.printTree(""); 672 } 673 674 /** This will print lots of specs about the tree. */ 675 public String toString() { 676 String out = ""; 677 678 out += "Tree ID: " + id + "\n"; 679 out += "Num Leaves: " + numLeaves + "\n"; 680 out += "Num Species: " + numSpecies + "\n"; 681 out += "Ultrametric: " + isUltrametric() + "\n"; 682 out += "Binary: " + isBinary() + "\n"; 683 out += "Level: [" + minLevel + ", " + maxLevel + "]\n"; 684 out += "Stem Length: [" + minStemLength + ", " + maxStemLength + "]\n"; 685 out += "Temporal Depth: [" + minTempDepth + ", " + maxTempDepth + "]\n"; 686 687 if (partitions.size() > 0) { 688 out += "Partitions (" + partitions.size() + "): "; 689 for (Iterator i = partitions.iterator(); i.hasNext();) { 690 out += "\n\t" + (Partition) i.next(); 691 } 692 } else { 693 out += "Partitions: None\n"; 694 } 695 696 return out; 697 } 698 699 //-------------------------------------------------------------------------- 700 // Partition Related Methods 701 702 /** The number of partitions loaded for this tree. */ 703 public int getNumPartitions() { return partitions.size(); } 704 705 /** This is the sum of the length of all partitions. */ 706 public int getLength() { 707 int len = 0; 708 for (Iterator i = partitions.iterator(); i.hasNext();) { 709 len += ((Partition) i.next()).getLength(); 710 } 711 return len; 712 } 713 714 public void addPartition(Partition partition) { 715 partitions.add(partition); 716 } 717 718 public void removePartition(Partition partition) { 719 partitions.remove(partition); 720 } 721 722 /** 723 * This will return an ArrayList containing all of the partitions 724 * in the current tree that are based on the specified model. 725 */ 726 public ArrayList getPartitionsByModel(String modelID) { 727 ArrayList out = new ArrayList(); 728 729 for (Iterator i = partitions.iterator(); i.hasNext();) { 730 Partition partition = (Partition) i.next(); 731 if (partition.getModelID().equalsIgnoreCase(modelID)) { 732 out.add(partition); 733 } 734 } 735 736 return out; 737 } 738 739 //-------------------------------------------------------------------------- 740 // These methods will modify the tree or return subtrees 741 742 /** 743 * This will remove all internal species that have only one child, 744 * linking the child to species' parent and adjusting the child's 745 * stem lengths accordingly. 746 */ 747 public void collapse() { 748 for (Iterator i = speciesList.iterator(); i.hasNext();) { 749 // if collapsed then remove the species from speciesList 750 if (((Species) i.next()).collapse()) i.remove(); 751 } 752 } 753 754 /** 755 * This will return a list of leaf counts for each subtree 756 * specified by the subroots. This can be used in conjunction 757 * with getTreesByTempDepth() and getTreesByLevel() to get the 758 * leaf counts for subtrees below a threshold value. 759 */ 760 public ArrayList getLeafCounts(ArrayList subroots) { 761 if ((subroots == null) || (subroots.size() == 0)) return subroots; 762 763 ArrayList counts = new ArrayList(); 764 765 for (Iterator i = subroots.iterator(); i.hasNext();) { 766 // get the subtree for the subroot 767 Tree subtree = getSubtree((Species) i.next()); 768 769 // add count to output 770 counts.add(new Integer(subtree.getNumLeaves())); 771 } 772 773 return counts; 774 } 775 776 /** 777 * Returns the subtree given the specified root. This will not 778 * create a unique Tree but instead will contain the species 779 * objects from the original tree. The trees returned will not be 780 * added to the tree pool and are meant to be used as temporary 781 * trees. If a unique trees are needed, then use Tree.clone(). 782 */ 783 public Tree getSubtree(Species subroot) { 784 if (subroot == null) return null; 785 786 // these trees are not added to the treePool 787 Tree newTree = new Tree(id + "_" + subroot.getID(), false); 788 789 // set the root for the new tree 790 newTree.setRoot(subroot); 791 792 Species species; 793 Queue queue = new LinkedList(); 794 795 // start with this root 796 queue.offer(subroot); 797 798 while ((species = (Species) queue.poll()) != null) { 799 // register species with new tree 800 newTree.addSpecies(species); 801 802 // add children to the queue 803 addChildrenToQueue(queue, species.getChildren()); 804 } 805 806 // the subtree is already built - it inherits this from the 807 // parent. 808 newTree.computeStats(); 809 return newTree; 810 } 811 812 /** 813 * Returns the set of root species for any subtrees that are below 814 * a specified level. 815 */ 816 public ArrayList getTreesByLevel(int level) { 817 // list of roots of the subtrees 818 ArrayList roots = new ArrayList(); 819 820 // no trees 821 if (level > maxLevel) return roots; 822 823 Species species; 824 Queue queue = new LinkedList(); 825 826 // start with the root 827 queue.offer(root); 828 829 while ((species = (Species) queue.poll()) != null) { 830 if (species.getLevel() >= level) { 831 // add species as root. don't add it's children to 832 // the queue, if it's an inner node. 833 roots.add(species); 834 } else { 835 // the species isn't deep enought, so add any children 836 // to queue 837 queue = addChildrenToQueue(queue, species.getChildren()); 838 } 839 } 840 841 return roots; 842 } 843 844 /** 845 * Returns the random, uniform sampling of species from each 846 * subtree demarkated by the level value. The number of species 847 * included from each subtree will be 'num' or the total number of 848 * leaves in each subtree if a subtree has less than 'num' leaves. 849 */ 850 public Tree uniformSampleByLevel(int level, int num) { 851 if (! buildTree()) { 852 CrimsonUtils.printError("Can't sample tree because no valid tree."); 853 return null; 854 } 855 856 // get the list of roots for each subtree that matches 857 ArrayList subroots = getTreesByLevel(level); 858 859 if (subroots.size() == 0) { 860 CrimsonUtils.printError("Sampling returned no valid trees."); 861 return null; 862 } 863 864 // this will be the list of leaves that have been selected 865 // from the various subtrees 866 ArrayList leaves = new ArrayList(); 867 868 // compute the number of leaves to select, per subtree 869 int perSubtree = num / subroots.size(); 870 int remainder = num % subroots.size(); 871 872 // pick leaves 873 for (int i = 0, n = subroots.size(); i < n; i++) { 874 // randomly select a subtree (ie subroot) from the array 875 int index = CrimsonUtils.random.nextInt(subroots.size()); 876 877 // using the subroot, get the subtree 878 Tree subtree = getSubtree((Species) subroots.get(index)); 879 880 // remove the subtree from the array so we don't pick it again 881 subroots.remove(index); 882 883 int numLeaves = subtree.getNumLeaves(); 884 885 // if less leaves in subtree than being sampled, then we 886 // need to add the entire subtree, otherwise, we need to 887 // randomly sample the subtree. 888 if (numLeaves > perSubtree) { 889 int leavesNeeded; 890 if (remainder > 0) { 891 // spread remainder out over subtrees. if 892 // subtree.numLeaves is less than perSubtree + 1, 893 // then the remainder will be skipped. this will 894 // result in 1 less leaf selected. 895 leavesNeeded = perSubtree + 1; 896 remainder--; 897 } else { 898 // if more subtrees than 'num', we won't be 899 // sampling from every subtree 900 if (perSubtree > 0) { 901 leavesNeeded = perSubtree; 902 } else { 903 // if we get to this point then we'd fulfilled 904 // the sampling requirements, even though we 905 // haven't sampled from all the subtrees. 906 break; 907 } 908 } 909 ArrayList subLeaves = subtree.getLeaves(); 910 for (int l = 0; l < leavesNeeded; l++) { 911 // randomly select a leaf from the array 912 int iLeaf = CrimsonUtils.random.nextInt(subLeaves.size()); 913 914 // add leaf to output array 915 leaves.add(subLeaves.get(iLeaf)); 916 917 // remove the subtree from the array so we don't pick it again 918 subLeaves.remove(iLeaf); 919 } 920 } else { 921 // add all leaves 922 leaves.addAll(subtree.getLeaves()); 923 } 924 } 925 926 return manualSampleByID(leaves); 927 } 928 929 930 /** 931 * Returns the random, weighted sampling of species from each 932 * subtree demarkated by the level value. The number of leaves 933 * included from each subtree is a function of the total number of 934 * leaves requested and the relative number of leaves in each 935 * subtree. This is computed by pooling all leaves from all 936 * subtrees and then randomly selecting leaves from the pool. 937 */ 938 public Tree weightedSampleByLevel(int level, int num) { 939 if (! buildTree()) { 940 CrimsonUtils.printError("Can't sample tree because no valid tree."); 941 return null; 942 } 943 944 ArrayList subtrees = getTreesByLevel(level); 945 946 if (subtrees.size() == 0) { 947 CrimsonUtils.printError("Sampling returned no valid trees."); 948 return null; 949 } 950 951 // create leaf pool, the total set of all leaves from all 952 // subtrees 953 ArrayList leafPool = new ArrayList(); 954 for (Iterator i = subtrees.iterator(); i.hasNext();) { 955 // get the subtree for the subroot 956 Tree subtree = getSubtree((Species) i.next()); 957 958 // add leaves to pool 959 leafPool.addAll(subtree.getLeaves()); 960 } 961 962 // randomly sample from the leaf pool, creating the list of 963 // selected leaves 964 ArrayList leaves = new ArrayList(); 965 if (num >= leafPool.size()) { 966 // add all leaves 967 for (Iterator i = leafPool.iterator(); i.hasNext();) { 968 // add leaf IDs 969 leaves.add(i.next()); 970 } 971 } else { 972 for (int i = 0; i < num; i++) { 973 // randomly select a species from the leaf pool 974 int index = CrimsonUtils.random.nextInt(leafPool.size()); 975 Species species = (Species) leafPool.get(index); 976 977 // add the ID of the random species to the list 978 leaves.add(species); 979 980 // remove the random selection from the pool 981 leafPool.remove(index); 982 } 983 } 984 return manualSampleByID(leaves); 985 } 986 987 /** 988 * Returns the set of root species for any subtrees that are below 989 * a specified level. 990 */ 991 public ArrayList getTreesByTempDepth(double tempDepth) { 992 // list of roots of the subtrees 993 ArrayList roots = new ArrayList(); 994 995 // no trees 996 if (tempDepth > maxTempDepth) return roots; 997 998 Species species; 999 Queue queue = new LinkedList(); 1000 1001 // start with the root 1002 queue.offer(root); 1003 1004 while ((species = (Species) queue.poll()) != null) { 1005 if (species.getTempDepth() >= tempDepth) { 1006 // add species as root. don't add it's children to 1007 // the queue, if it's an inner node. 1008 roots.add(species); 1009 } else { 1010 // the species isn't deep enough, so add any children 1011 // to queue 1012 queue = addChildrenToQueue(queue, species.getChildren()); 1013 } 1014 } 1015 1016 return roots; 1017 } 1018 1019 /** 1020 * Returns the random, uniform sampling of species from each 1021 * subtree demarkated by the temporal depth value. The number of 1022 * species included from each subtree will be 'num' or the total 1023 * number of leaves in each subtree if a subtree has less than 1024 * 'num' leaves. 1025 */ 1026 public Tree uniformSampleByTempDepth(double tempDepth, int num) { 1027 if (! buildTree()) { 1028 CrimsonUtils.printError("Can't sample tree because no valid tree."); 1029 return null; 1030 } 1031 1032 // get the list of roots for each subtree that matches 1033 ArrayList subroots = getTreesByTempDepth(tempDepth); 1034 1035 if (subroots.size() == 0) { 1036 CrimsonUtils.printError("Sampling returned no valid trees."); 1037 return null; 1038 } 1039 1040 // this will be the list of leaves that have been selected 1041 // from the various subtrees 1042 ArrayList leaves = new ArrayList(); 1043 1044 // compute the number of leaves to select, per subtree 1045 int perSubtree = num / subroots.size(); 1046 int remainder = num % subroots.size(); 1047 1048 // pick leaves 1049 for (int i = 0, n = subroots.size(); i < n; i++) { 1050 // randomly select a subtree (ie subroot) from the array 1051 int index = CrimsonUtils.random.nextInt(subroots.size()); 1052 1053 // using the subroot, get the subtree 1054 Tree subtree = getSubtree((Species) subroots.get(index)); 1055 1056 // remove the subroot from the array so we don't pick it again 1057 subroots.remove(index); 1058 1059 int numLeaves = subtree.getNumLeaves(); 1060 1061 // if less leaves in subtree than being sampled, then we 1062 // need to add the entire subtree, otherwise, we need to 1063 // randomly sample the subtree. 1064 if (numLeaves > perSubtree) { 1065 int leavesNeeded; 1066 if (remainder > 0) { 1067 // spread remainder out over subtrees. if 1068 // subtree.numLeaves is less than perSubtree + 1, 1069 // then the remainder will be skipped. this will 1070 // result in 1 less leaf selected. 1071 leavesNeeded = perSubtree + 1; 1072 remainder--; 1073 } else { 1074 // if more subtrees than 'num', we won't be 1075 // sampling from every subtree 1076 if (perSubtree > 0) { 1077 leavesNeeded = perSubtree; 1078 } else { 1079 // if we get to this point then we'd fulfilled 1080 // the sampling requirements, even though we 1081 // haven't sampled from all the subtrees. 1082 break; 1083 } 1084 1085 } 1086 ArrayList subLeaves = subtree.getLeaves(); 1087 for (int l = 0; l < leavesNeeded; l++) { 1088 // randomly select a leaf from the array 1089 int iLeaf = CrimsonUtils.random.nextInt(subLeaves.size()); 1090 1091 // add leaf to output array 1092 leaves.add(subLeaves.get(iLeaf)); 1093 1094 // remove the subtree from the array so we don't pick it again 1095 subLeaves.remove(iLeaf); 1096 } 1097 } else { 1098 // add all leaves 1099 leaves.addAll(subtree.getLeaves()); 1100 } 1101 } 1102 1103 return manualSampleByID(leaves); 1104 } 1105 1106 1107 /** 1108 * Returns the random, weighted sampling of species from each 1109 * subtree demarkated by the temporal depth value. The number of 1110 * leaves included from each subtree is a function of the total 1111 * number of leaves requested and the relative number of leaves in 1112 * each subtree. This is computed by pooling all leaves from all 1113 * subtrees and then randomly selecting leaves from the pool. 1114 */ 1115 public Tree weightedSampleByTempDepth(double tempDepth, int num) { 1116 if (! buildTree()) { 1117 CrimsonUtils.printError("Can't sample tree because no valid tree."); 1118 return null; 1119 } 1120 1121 ArrayList subtrees = getTreesByTempDepth(tempDepth); 1122 1123 if (subtrees.size() == 0) { 1124 CrimsonUtils.printError("Sampling returned no valid trees."); 1125 return null; 1126 } 1127 1128 // create leaf pool, the total set of all leaves from all 1129 // subtrees 1130 ArrayList leafPool = new ArrayList(); 1131 for (Iterator i = subtrees.iterator(); i.hasNext();) { 1132 // get the subtree for the subroot 1133 Tree subtree = getSubtree((Species) i.next()); 1134 1135 // add leaves to pool 1136 leafPool.addAll(subtree.getLeaves()); 1137 } 1138 1139 // randomly sample from the leaf pool, creating the list of 1140 // selected leaves 1141 ArrayList leaves = new ArrayList(); 1142 if (num >= leafPool.size()) { 1143 // add all leaves 1144 for (Iterator i = leafPool.iterator(); i.hasNext();) { 1145 // add leaf IDs 1146 leaves.add(i.next()); 1147 } 1148 } else { 1149 for (int i = 0; i < num; i++) { 1150 // randomly select a species from the leaf pool 1151 int index = CrimsonUtils.random.nextInt(leafPool.size()); 1152 Species species = (Species) leafPool.get(index); 1153 1154 // add the ID of the random species to the list 1155 leaves.add(species); 1156 1157 // remove the random selection from the pool 1158 leafPool.remove(index); 1159 } 1160 } 1161 1162 return manualSampleByID(leaves); 1163 } 1164 1165 /** 1166 * This will return a subtree that contains the specified leaves. 1167 * The tree returned will contain species that are copies of those 1168 * in the original tree. The input for this method is an 1169 * ArrayList containing species IDs or species objects. It takes 1170 * a significant amount of time to convert from species IDs to 1171 * species objects. 1172 */ 1173 public Tree manualSampleByID(ArrayList oldLeaves) { 1174 if (! buildTree()) { 1175 CrimsonUtils.printError("Can't sample tree because no valid tree."); 1176 return null; 1177 } 1178 1179 if ((oldLeaves == null) || (oldLeaves.size() == 0)) return null; 1180 1181 // if we have a list of species IDs, then we need to convert 1182 // the IDs to objects before we begin. This is a slow 1183 // process. 1184 Object objTest = oldLeaves.get(0); 1185 if (objTest instanceof String) oldLeaves = getSpeciesByID(oldLeaves); 1186 1187 // give user some feedback 1188 CrimsonUtils.printMsg("ManualSampleByID: .", false); 1189 1190 // this will output a list of all leaves to the file 1191 // 'species.list' 1192 if (DEBUG_OUTPUT) FileIO.printSpecies(new TreeSet(oldLeaves), true); 1193 // if (DEBUG_OUTPUT) FileIO.printLeaves(new TreeSet(oldLeaves)); 1194 1195 Tree newTree = new Tree(id + "_" + Long.toString(Math.abs(CrimsonUtils.random.nextLong())), false); 1196 1197 // since the new tree contains copies of the species in the 1198 // old tree, we need to store a reference to original species 1199 // that is now the root of the new tree. 1200 Species oldRoot = null; 1201 1202 // we maintain a map of the species IDs and references to the 1203 // cloned species contained in the new tree. These will 1204 // facilitate adding new species. 1205 HashMap newSpeciesMap = new HashMap(); 1206 HashSet newSpeciesSet = new HashSet(); 1207 1208 // PROCESS FIRST SELECTION 1209 1210 // randomly select a leaf from the list 1211 Species oldLeaf = (Species) oldLeaves.get(0); 1212 1213 if (oldLeaf == null) { 1214 CrimsonUtils.printError("Invalid species ID: " + oldLeaf.getID()); 1215 return null; 1216 } 1217 1218 // we don't want to muck with the old leaf but instead, 1219 // will add a copy of the old leaf to the new tree. The 1220 // copy will have no children, parent, or level. It will 1221 // retain the original id and stem length. 1222 Species newLeaf = (Species) oldLeaf.copy(newTree); 1223 1224 // maintain a mapping between spcies IDs and references to the 1225 // species in the new tree (ie the clones). The contains() 1226 // method is much slower with a Map than it is with a Set, so 1227 // we maintain both a Map and a Set. The memory cost is less 1228 // significant than the temporal cost. 1229 1230 newSpeciesMap.put(newLeaf.getID(), newLeaf); 1231 newSpeciesSet.add(newLeaf); 1232 1233 // this is the first species, so add it as root 1234 newTree.setRoot(newLeaf); 1235 1236 // keep a reference to the original species within the 1237 // original tree. 1238 oldRoot = oldLeaf; 1239 1240 // for simplicity, store the number of leaves to be included 1241 int num = oldLeaves.size(); 1242 1243 // use this to figure out when to print a "." 1244 int inc = (num > 10) ? num / 10 : 1; 1245 1246 // PROCESS REMAINING SELECTIONS. Loop until all leaves have 1247 // been sampled 1248 for (int index = 1; index < num; index++) { 1249 // add "." every 10% completed 1250 if ((index % inc) == 0) CrimsonUtils.printMsg(".", false); 1251 1252 // get next leaf from the list 1253 oldLeaf = (Species) oldLeaves.get(index); 1254 1255 if (oldLeaf == null) { 1256 CrimsonUtils.printError("Invalid species ID: " + oldLeaf.getID()); 1257 return null; 1258 } 1259 1260 // we don't want to muck with the old leaf but instead, 1261 // will add a copy of the old leaf to the new tree. The 1262 // copy will have no children, parent, or level. It will 1263 // retain the original id and stem length. 1264 newLeaf = (Species) oldLeaf.copy(newTree); 1265 // maintain a mapping between spcies IDs and references to 1266 // the species in the new tree (ie the clones) 1267 newSpeciesMap.put(newLeaf.getID(), newLeaf); 1268 newSpeciesSet.add(newLeaf); 1269 1270 // since not the first leaf, we need to find the LCA 1271 // between the root of the new tree and the new leaf 1272 Species oldLCA = getLCA(oldRoot, oldLeaf); 1273 1274 if (oldLCA == oldRoot) { 1275 // loop until we've propogated up the tree from the 1276 // leaf to the LCA. 1277 while (true) { 1278 // get the parent 1279 Species oldParent = oldLeaf.getParent(); 1280 1281 if (newSpeciesSet.contains(oldParent)) { 1282 // we found the LCA, so now we need to add the 1283 // new child as a child of the LCA 1284 1285 // get a reference to the LCA in the new tree, 1286 // using the species ID 1287 Species newLCA = (Species) newSpeciesMap.get(oldParent.getID()); 1288 1289 // add the new child to the new LCA 1290 newLCA.addChild(newLeaf); 1291 1292 break; 1293 } else { 1294 // the inner node doesn't exist in the new tree, 1295 // so we will need to add it to the tree 1296 1297 // create a copy of the parent to be added to 1298 // the new tree 1299 Species newParent = (Species) oldParent.copy(newTree); 1300 1301 // add the cloned child to the cloned parent 1302 newParent.addChild(newLeaf); 1303 1304 // now we need to update the parents 1305 newLeaf = newParent; 1306 oldLeaf = oldParent; 1307 1308 // add the new parent to the newTree map 1309 newSpeciesMap.put(newParent.getID(), newParent); 1310 newSpeciesSet.add(newParent); 1311 } 1312 } 1313 1314 } else { 1315 // lca != nRoot: extend oldRoot and newLeaf to lca. 1316 1317 // need to extend the root of the new tree to the LCA, 1318 // via it's parents in the original tree. loop until 1319 // we've propogated up the tree from the NEW ROOT TO 1320 // THE LCA. 1321 while (true) { 1322 // get the parent of the new root, from the 1323 // original tree 1324 Species oldParent = oldRoot.getParent(); 1325 1326 // create a copy of the parent to be added to 1327 // the new tree 1328 Species newParent = (Species) oldParent.copy(newTree); 1329 1330 // add the cloned child to the cloned parent 1331 newParent.addChild(newTree.getRoot()); 1332 1333 // update root 1334 oldRoot = oldParent; 1335 newTree.setRoot(newParent); 1336 1337 // since we added a new node to the tree, we need 1338 // to update the newTree's HashMap 1339 newSpeciesMap.put(newParent.getID(), newParent); 1340 newSpeciesSet.add(newParent); 1341 1342 // if we found the LCA, then we're done, else 1343 // continue adding parents 1344 if (oldLCA == oldParent) break; 1345 } 1346 1347 // need to extend the new leaf via it's parents up to 1348 // the LCA. loop until we've propogated up the tree 1349 // from the NEW LEAF TO THE LCA. 1350 while (true) { 1351 // get the parent 1352 Species oldParent = oldLeaf.getParent(); 1353 1354 if (newSpeciesSet.contains(oldParent)) { 1355 // we found the LCA, so now we need to add the 1356 // new child as a child of the LCA 1357 1358 // get a reference to the LCA in the new tree, 1359 // using the species ID 1360 Species newLCA = (Species) newSpeciesMap.get(oldParent.getID()); 1361 1362 // add the new child to the new LCA 1363 newLCA.addChild(newLeaf); 1364 1365 break; 1366 } else { 1367 // the inner node doesn't exist in the new tree, 1368 // so we will need to add it to the tree 1369 1370 // create a copy of the parent to be added to 1371 // the new tree 1372 Species newParent = (Species) oldParent.copy(newTree); 1373 1374 // add the cloned child to the cloned parent 1375 newParent.addChild(newLeaf); 1376 1377 // now we can just need to deal with the 1378 // parents 1379 newLeaf = newParent; 1380 oldLeaf = oldParent; 1381 1382 // since we've now effectively added a new 1383 // node (newParent) to newTree, we need to 1384 // update the newTree's HashMap 1385 newSpeciesMap.put(newParent.getID(), newParent); 1386 newSpeciesSet.add(newParent); 1387 } 1388 } 1389 } 1390 } 1391 1392 // add CR-LF to end of "." 1393 CrimsonUtils.printMsg(""); 1394 1395 newTree.collapse(); 1396 newTree.computeStats(); 1397 return newTree; 1398 } 1399 1400 1401 /** 1402 * This will randomly sample the tree, returning a subtree that 1403 * contains 'num' leaves. The tree returned will contain species 1404 * that are copies of those in the original tree. 1405 */ 1406 public Tree randomSample(int num) { 1407 if (! buildTree()) { 1408 CrimsonUtils.printError("Can't sample tree because no valid tree."); 1409 return null; 1410 } 1411 1412 if (num < 1) return null; 1413 1414 // need the leaves as a list so we can more easily (randomly) 1415 // select leaves 1416 ArrayList oldLeaves = getLeaves(); 1417 1418 // make sure enough leaves to fullfil request 1419 if (oldLeaves.size() < num) { 1420 String msg = "Less leaves (" + String.valueOf(oldLeaves.size()) + ") "; 1421 msg += "in tree, than requested in tree sample (" + String.valueOf(num) + ")\n"; 1422 CrimsonUtils.printWarning(msg); 1423 1424 // since we're returning all leaves, we just need to clone the tree 1425 return (Tree) this.clone(); 1426 } 1427 1428 // give user some feedback 1429 CrimsonUtils.printMsg("RandomSample: .", false); 1430 1431 // Tree newTree = new Tree(id + "_" + String.valueOf(num)); 1432 Tree newTree = new Tree(id + "_" + Long.toString(Math.abs(CrimsonUtils.random.nextLong())), false); 1433 1434 // since the new tree contains copies of the species in the 1435 // old tree, we need to store a reference to original species 1436 // that is now the root of the new tree. 1437 Species oldRoot = null; 1438 1439 // we maintain a map of the species IDs and references to the 1440 // cloned species contained in the new tree. These will 1441 // facilitate adding new species. 1442 HashMap newSpeciesMap = new HashMap(); 1443 HashSet newSpeciesSet = new HashSet(); 1444 1445 // PROCESS FIRST SELECTION 1446 1447 // randomly select a leaf from the list 1448 int index = CrimsonUtils.random.nextInt(oldLeaves.size()); 1449 Species oldLeaf = (Species) oldLeaves.get(index); 1450 1451 // remove the leaf from the list of leaves. This way we 1452 // don't have to worry about selecting it again 1453 oldLeaves.remove(index); 1454 1455 // we don't want to muck with the old leaf but instead, 1456 // will add a copy of the old leaf to the new tree. The 1457 // copy will have no children, parent, or level. It will 1458 // retain the original id and stem length. 1459 Species newLeaf = (Species) oldLeaf.copy(newTree); 1460 1461 // maintain a mapping between spcies IDs and references to the 1462 // species in the new tree (ie the clones). The contains() 1463 // method is much slower with a Map than it is with a Set, so 1464 // we maintain both a Map and a Set. The memory cost is less 1465 // significant than the temporal cost. 1466 1467 newSpeciesMap.put(newLeaf.getID(), newLeaf); 1468 newSpeciesSet.add(newLeaf); 1469 1470 // this is the first species, so add it as root 1471 newTree.setRoot(newLeaf); 1472 1473 // keep a reference to the original species within the 1474 // original tree. 1475 oldRoot = oldLeaf; 1476 1477 // use this to figure out when to print a "." 1478 int inc = (num > 10) ? num / 10 : 1; 1479 1480 // PROCESS REMAINING SELECTIONS. Loop until all leaves have 1481 // been sampled 1482 for (int i = 1; i < num; i++) { 1483 // add "." every 10% completed 1484 if ((i % inc) == 0) CrimsonUtils.printMsg(".", false); 1485 1486 // randomly select a leaf from the list 1487 index = CrimsonUtils.random.nextInt(oldLeaves.size()); 1488 oldLeaf = (Species) oldLeaves.get(index); 1489 1490 // remove the leaf from the list of leaves. This way we 1491 // don't have to worry about selecting it again 1492 oldLeaves.remove(index); 1493 1494 // we don't want to muck with the old leaf but instead, 1495 // will add a copy of the old leaf to the new tree. The 1496 // copy will have no children, parent, or level. It will 1497 // retain the original id and stem length. 1498 newLeaf = (Species) oldLeaf.copy(newTree); 1499 // maintain a mapping between spcies IDs and references to 1500 // the species in the new tree (ie the clones) 1501 newSpeciesMap.put(newLeaf.getID(), newLeaf); 1502 newSpeciesSet.add(newLeaf); 1503 1504 // since not the first leaf, we need to find the LCA 1505 // between the root of the new tree and the new leaf 1506 Species oldLCA = getLCA(oldRoot, oldLeaf); 1507 1508 if (oldLCA == oldRoot) { 1509 // loop until we've propogated up the tree from the 1510 // leaf to the LCA. 1511 while (true) { 1512 // get the parent 1513 Species oldParent = oldLeaf.getParent(); 1514 1515 if (newSpeciesSet.contains(oldParent)) { 1516 // we found the LCA, so now we need to add the 1517 // new child as a child of the LCA 1518 1519 // get a reference to the LCA in the new tree, 1520 // using the species ID 1521 Species newLCA = (Species) newSpeciesMap.get(oldParent.getID()); 1522 1523 // add the new child to the new LCA 1524 newLCA.addChild(newLeaf); 1525 1526 break; 1527 } else { 1528 // the inner node doesn't exist in the new tree, 1529 // so we will need to add it to the tree 1530 1531 // create a copy of the parent to be added to 1532 // the new tree 1533 Species newParent = (Species) oldParent.copy(newTree); 1534 1535 // add the cloned child to the cloned parent 1536 newParent.addChild(newLeaf); 1537 1538 // now we need to update the parents 1539 newLeaf = newParent; 1540 oldLeaf = oldParent; 1541 1542 // add the new parent to the newTree map 1543 newSpeciesMap.put(newParent.getID(), newParent); 1544 newSpeciesSet.add(newParent); 1545 } 1546 } 1547 1548 } else { 1549 // lca != nRoot: extend oldRoot and newLeaf to lca. 1550 1551 // need to extend the root of the new tree to the LCA, 1552 // via it's parents in the original tree. loop until 1553 // we've propogated up the tree from the NEW ROOT TO 1554 // THE LCA. 1555 while (true) { 1556 // get the parent of the new root, from the 1557 // original tree 1558 Species oldParent = oldRoot.getParent(); 1559 1560 // create a copy of the parent to be added to 1561 // the new tree 1562 Species newParent = (Species) oldParent.copy(newTree); 1563 1564 // add the cloned child to the cloned parent 1565 newParent.addChild(newTree.getRoot()); 1566 1567 // update root 1568 oldRoot = oldParent; 1569 newTree.setRoot(newParent); 1570 1571 // since we added a new node to the tree, we need 1572 // to update the newTree's HashMap 1573 newSpeciesMap.put(newParent.getID(), newParent); 1574 newSpeciesSet.add(newParent); 1575 1576 // if we found the LCA, then we're done, else 1577 // continue adding parents 1578 if (oldLCA == oldParent) break; 1579 } 1580 1581 // need to extend the new leaf via it's parents up to 1582 // the LCA. loop until we've propogated up the tree 1583 // from the NEW LEAF TO THE LCA. 1584 while (true) { 1585 // get the parent 1586 Species oldParent = oldLeaf.getParent(); 1587 1588 if (newSpeciesSet.contains(oldParent)) { 1589 // we found the LCA, so now we need to add the 1590 // new child as a child of the LCA 1591 1592 // get a reference to the LCA in the new tree, 1593 // using the species ID 1594 Species newLCA = (Species) newSpeciesMap.get(oldParent.getID()); 1595 1596 // add the new child to the new LCA 1597 newLCA.addChild(newLeaf); 1598 1599 break; 1600 } else { 1601 // the inner node doesn't exist in the new tree, 1602 // so we will need to add it to the tree 1603 1604 // create a copy of the parent to be added to 1605 // the new tree 1606 Species newParent = (Species) oldParent.copy(newTree); 1607 1608 // add the cloned child to the cloned parent 1609 newParent.addChild(newLeaf); 1610 1611 // now we can just need to deal with the 1612 // parents 1613 newLeaf = newParent; 1614 oldLeaf = oldParent; 1615 1616 // since we've now effectively added a new 1617 // node (newParent) to newTree, we need to 1618 // update the newTree's HashMap 1619 newSpeciesMap.put(newParent.getID(), newParent); 1620 newSpeciesSet.add(newParent); 1621 } 1622 } 1623 } 1624 } 1625 1626 // add CR-LF to end of "." 1627 CrimsonUtils.printMsg(""); 1628 1629 newTree.collapse(); 1630 newTree.computeStats(); 1631 1632 // this will output a list of all leaves to the file 1633 // 'leaf.list' 1634 // if (DEBUG_OUTPUT) FileIO.printSpecies(new TreeSet(newSpeciesSet), true); 1635 if (DEBUG_OUTPUT) FileIO.printSpecies(newTree.getSpeciesList(), false); 1636 if (DEBUG_OUTPUT) FileIO.printSpecies(newSpeciesSet, true); 1637 1638 return newTree; 1639 } 1640 1641 /** 1642 * This will return the LCA for all of the leaves in the input 1643 * set. 1644 */ 1645 /* 1646 public Species getRootLCA(ArrayList leaves) { 1647 if (! buildTree()) { 1648 CrimsonUtils.printError("Can't sample tree because no valid tree."); 1649 return null; 1650 } 1651 1652 if ((leaves == null) || (leaves.size() == 0)) return null; 1653 1654 // this will output a list of all leaves to the file 1655 // 'species.list' 1656 if (DEBUG_OUTPUT) FileIO.printSpecies(new TreeSet(newSpeciesSet), true); 1657 // if (DEBUG_OUTPUT) FileIO.printLeaves(new TreeSet(leaves)); 1658 1659 // keep track of inner nodes processed 1660 HashSet speciesSet = new HashSet(); 1661 1662 // PROCESS FIRST SELECTION 1663 1664 // randomly select a leaf from the list 1665 String speciesID = (String) leaves.get(0); 1666 Species leaf = getSpeciesByID(speciesID); 1667 if (leaf == null) { 1668 CrimsonUtils.printError("Invalid species ID: " + speciesID); 1669 return null; 1670 } 1671 1672 // this is the first species, so add it as root. since it's a 1673 // leaf (and not an inner node) we don't need to add it to 1674 // speciesSet. 1675 Species root = leaf; 1676 1677 // for simplicity, store the number of leaves to be included 1678 int num = leaves.size(); 1679 1680 // use this to figure out when to print a "." 1681 int inc = (num > 10) ? num / 10 : 1; 1682 1683 // give user some feedback 1684 CrimsonUtils.printMsg("GetRootLCA: .", false); 1685 1686 // PROCESS REMAINING SELECTIONS. Loop until all leaves have 1687 // been sampled 1688 for (int index = 1; index < num; index++) { 1689 // add "." every 10% completed 1690 if ((index % inc) == 0) CrimsonUtils.printMsg(".", false); 1691 1692 // get next leaf from the list 1693 speciesID = (String) leaves.get(index); 1694 leaf = getSpeciesByID(speciesID); 1695 if (leaf == null) { 1696 CrimsonUtils.printError("Invalid species ID: " + speciesID); 1697 return null; 1698 } 1699 1700 // since not the first leaf, we need to find the LCA 1701 // between the root of the new tree and the new leaf 1702 Species lca = getLCA(root, leaf); 1703 1704 if (lca == root) { 1705 // loop until we've propogated up the tree from the 1706 // leaf to the LCA. 1707 while (true) { 1708 // get the parent 1709 Species parent = leaf.getParent(); 1710 1711 if (speciesSet.contains(parent)) { 1712 break; // found lca 1713 } else { 1714 // this is a new inner node, so add it to set 1715 speciesSet.add(parent); 1716 } 1717 } 1718 } else { 1719 // lca != root: extend root and leaf to lca. 1720 1721 // need to extend the root to the LCA, via it's 1722 // parents in the tree. loop until we've propogated 1723 // up the tree from the NEW ROOT TO THE LCA. 1724 while (true) { 1725 // get the parent of the new root, from the 1726 // original tree 1727 Species parent = root.getParent(); 1728 1729 // update root 1730 root = parent; 1731 1732 // this is a new inner node, so add it to set 1733 speciesSet.add(parent); 1734 1735 // if we found the LCA, then we're done, else 1736 // continue adding parents 1737 if (lca == parent) break; 1738 } 1739 1740 // need to extend the new leaf via it's parents up to 1741 // the LCA. loop until we've propogated up the tree 1742 // from the NEW LEAF TO THE LCA. 1743 while (true) { 1744 // get the parent 1745 Species parent = leaf.getParent(); 1746 1747 if (speciesSet.contains(parent)) { 1748 break; // found lca 1749 } else { 1750 // new inner node so add it to the set 1751 speciesSet.add(parent); 1752 } 1753 } 1754 } 1755 } 1756 1757 // add CR-LF to end of "." 1758 CrimsonUtils.printMsg(""); 1759 return root; 1760 } 1761 */ 1762 1763 /** 1764 * This will return the LCA for a random sampling of leaves from 1765 * the tree. 1766 */ 1767 public Species getRootLCA(int num) { 1768 if (! buildTree()) { 1769 CrimsonUtils.printError("Can't sample tree because no valid tree."); 1770 return null; 1771 } 1772 1773 if (num < 1) return null; 1774 1775 // need the leaves as a list so we can more easily (randomly) 1776 // select leaves 1777 ArrayList leaves = getLeaves(); 1778 1779 // make sure enough leaves to fullfil request 1780 if (leaves.size() < num) { 1781 String msg = "Less leaves (" + String.valueOf(leaves.size()) + ") "; 1782 msg += "in tree, than requested in tree sample (" + String.valueOf(num) + ")\n"; 1783 CrimsonUtils.printWarning(msg); 1784 1785 // since we're returning all leaves, we just need to clone the tree 1786 return null; 1787 } 1788 1789 // keep track of inner nodes proccessed 1790 HashSet speciesSet = new HashSet(); 1791 1792 // PROCESS FIRST SELECTION 1793 1794 // randomly select a leaf from the list 1795 int index = CrimsonUtils.random.nextInt(leaves.size()); 1796 Species leaf = (Species) leaves.get(index); 1797 1798 // this is the first species, so add it as root. since it's a 1799 // leaf (and not an inner node) we don't need to add it to 1800 // speciesSet. 1801 Species root = leaf; 1802 1803 // remove the leaf from the list of leaves. This way we 1804 // don't have to worry about selecting it again 1805 leaves.remove(index); 1806 1807 // use this to figure out when to print a "." 1808 int inc = (num > 10) ? num / 10 : 1; 1809 1810 // give user some feedback 1811 CrimsonUtils.printMsg("GetRootLCA: .", false); 1812 1813 // PROCESS REMAINING SELECTIONS. Loop until all leaves have 1814 // been sampled 1815 for (int i = 1; i < num; i++) { 1816 // add "." every 10% completed 1817 if ((i % inc) == 0) CrimsonUtils.printMsg(".", false); 1818 1819 // randomly select a leaf from the list 1820 index = CrimsonUtils.random.nextInt(leaves.size()); 1821 leaf = (Species) leaves.get(index); 1822 1823 // remove the leaf from the list of leaves. This way we 1824 // don't have to worry about selecting it again 1825 leaves.remove(index); 1826 1827 // this is a new inner node, so we will need to add it to 1828 // our set 1829 speciesSet.add(leaf); 1830 1831 // since not the first leaf, we need to find the LCA 1832 // between the root of the new tree and the new leaf 1833 Species lca = getLCA(root, leaf); 1834 1835 if (lca == root) { 1836 // loop until we've propogated up the tree from the 1837 // leaf to the LCA. 1838 while (true) { 1839 // get the parent 1840 Species parent = leaf.getParent(); 1841 1842 if (speciesSet.contains(parent)) { 1843 break; // we found the LCA 1844 } else { 1845 // this is a new inner node, so add it to set 1846 speciesSet.add(parent); 1847 } 1848 } 1849 1850 } else { 1851 // lca != nRoot: extend root and newLeaf to lca. 1852 1853 // need to extend the root of the new tree to the LCA, 1854 // via it's parents in the original tree. loop until 1855 // we've propogated up the tree from the NEW ROOT TO 1856 // THE LCA. 1857 while (true) { 1858 // get the parent of the new root, from the 1859 // original tree 1860 Species parent = root.getParent(); 1861 1862 // update root 1863 root = parent; 1864 1865 // this is a new inner node, so add it to set 1866 speciesSet.add(parent); 1867 1868 // if we found the LCA, then we're done, else 1869 // continue adding parents 1870 if (lca == parent) break; 1871 } 1872 1873 // need to extend the new leaf via it's parents up to 1874 // the LCA. loop until we've propogated up the tree 1875 // from the NEW LEAF TO THE LCA. 1876 while (true) { 1877 // get the parent 1878 Species parent = leaf.getParent(); 1879 1880 if (speciesSet.contains(parent)) { 1881 break; // found lca 1882 } else { 1883 // new inner node so add it to the set 1884 speciesSet.add(parent); 1885 } 1886 } 1887 } 1888 } 1889 1890 // add CR-LF to end of "." 1891 CrimsonUtils.printMsg(""); 1892 1893 // this will output a list of all leaves to the file 1894 // 'leaf.list' 1895 if (DEBUG_OUTPUT) FileIO.printSpecies(new TreeSet(speciesSet), true); 1896 1897 return root; 1898 } 1899 1900 //-------------------------------------------------------------------------- 1901 // Non-specific Tree Related Functions. 1902 1903 /** 1904 * Returns the LCA for the tree, assuming this species is the root 1905 * of the tree. This requires that the level values have been set. 1906 */ 1907 public static Species getLCA(Species s1, Species s2) { 1908 if (s1 == s2) return s1; 1909 if (s1.isRoot()) return s1; 1910 if (s2.isRoot()) return s2; 1911 1912 Species lca = null; 1913 Species des = null; 1914 if (s1.getLevel() < s2.getLevel()) { 1915 lca = s1; 1916 des = s2; 1917 } else if (s1.getLevel() > s2.getLevel()) { 1918 lca = s2; 1919 des = s1; 1920 } else { 1921 lca = s1.getParent(); 1922 des = s2; 1923 } 1924 1925 while (des.getLevel() - lca.getLevel() > 1) { 1926 des = des.getParent(); 1927 } 1928 1929 while ((! lca.isRoot()) && (des.getParent() != lca)) { 1930 // while ((! lca.isRoot()) && (! (des.getParent()).equals(lca))) { 1931 des = des.getParent(); 1932 if (des.getLevel() == lca.getLevel()) { 1933 lca = lca.getParent(); 1934 } 1935 } 1936 1937 return lca; 1938 } 1939 1940 } // Tree.java 1941