001 /* 002 * CRIMSON 003 * Copyright (c) 2006, Stephen Fisher, Susan Davidson, and Junhyong Kim, 004 * University of Pennsylvania. 005 * 006 * This program is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU General Public License as 008 * published by the Free Software Foundation; either version 2 of the 009 * License, or (at your option) any later version. 010 * 011 * This program is distributed in the hope that it will be useful, but 012 * WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License 017 * along with this program; if not, write to the Free Software 018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 019 * 02110-1301 USA. 020 * 021 * @(#)Queries.java 022 */ 023 024 package edu.upenn.crimson.io; 025 026 import edu.upenn.crimson.*; 027 import java.util.HashSet; 028 import java.util.ArrayList; 029 import java.util.TreeSet; 030 import java.util.Iterator; 031 import java.util.Scanner; 032 import java.io.*; 033 import java.sql.Clob; 034 import java.sql.SQLException; 035 036 /** 037 * Functions related to QUERIES table. 038 * 039 * @XXX Should see if can optimize db access by maintaining one 040 * Statement that is created every time a database connection is made. 041 * 042 * @author Stephen Fisher 043 * @version $Id: Queries.java,v 1.66 2009/07/21 01:19:12 fisher Exp $ 044 */ 045 046 public class Queries { 047 /** 048 * If true, then whenever a (bp or codon) position selection query 049 * is run, the list of positions selected will be output to the 050 * file 'positions.list'. 051 */ 052 final static boolean DEBUG_OUTPUT = true; 053 054 /** 055 * When writing data to a NEXUS file ('run()'), this is the number 056 * of species queried per SQL statement. If more than 'SPECIES_CHUNK' 057 * number of species to query, than multiple queries will be 058 * performed, each one with 'SPECIES_CHUNK' number of species. 059 */ 060 final static int SPECIES_CHUNK = 50; 061 062 063 //-------------------------------------------------------------------------- 064 // Miscellaneous Methods 065 066 /** 067 * This will load the specified query from the QUERIES table in 068 * the current database. If no query is specified then all queries 069 * will be loaded. 070 */ 071 public static void load(String id) { 072 // make sure a database is open. We don't need to check for 073 // the existance of the QUERIES database because this was done 074 // when the database was opened. 075 if (! Database.isOpen()) { 076 CrimsonUtils.printError("Can't load query, no open database."); 077 return; 078 } 079 080 CrimsonUtils.printMsg("Loading query info..."); 081 082 // get the list of queries in the QUERIES table. 083 /* 084 String sql = "SELECT id, tree_id, notes, leaf_select, num_leaves, "; 085 sql += "temp_depth_thresh, level_thresh, seq_select, num_pos, seed "; 086 sql += " FROM QUERIES"; 087 */ 088 String sql = "SELECT id, tree_id, notes, leaf_select, num_leaves, "; 089 sql += "temp_depth_thresh, level_thresh, seq_select, num_pos, seed, "; 090 sql += "leaves, partitions, positions FROM QUERIES"; 091 // if ID exists, then only load that query 092 if (! CrimsonUtils.isEmpty(id)) sql += " WHERE id = '" + id.toUpperCase() + "'"; 093 sql = sql.toUpperCase(); 094 095 if (Database.isProxy()) { 096 StringBuilder results = Database.ImportQuery(sql); 097 098 // if the query failed then exit 099 if (results == null) { 100 CrimsonUtils.printError("Unable to build queries lists, error accessing QUERIES table."); 101 return; 102 } 103 104 Query.dbLoadProxy(results); 105 } else { 106 ArrayList results = Database.sqlToArray(sql); 107 108 // if the query failed then exit 109 if (results == null) { 110 CrimsonUtils.printError("Unable to build queries lists, error accessing QUERIES table."); 111 return; 112 } 113 114 // convert each row in the results to a Query 115 for (Iterator rows = results.iterator(); rows.hasNext();) { 116 Query.dbLoad((ArrayList) rows.next()); 117 } 118 } 119 120 /* 121 try { 122 // convert each row in the results to a Query 123 for (Iterator rows = results.iterator(); rows.hasNext();) { 124 ArrayList row = (ArrayList) rows.next(); 125 126 Query query = new Query(String.valueOf(row.get(1)), String.valueOf(row.get(0))); 127 query.setNotes(String.valueOf(row.get(2))); 128 query.setLeafSelection(Integer.parseInt(String.valueOf(row.get(3)))); 129 query.setNumLeaves(Integer.parseInt(String.valueOf(row.get(4)))); 130 query.setTempDepthThresh(Double.parseDouble(String.valueOf(row.get(5)))); 131 query.setLevelThresh(Integer.parseInt(String.valueOf(row.get(6)))); 132 query.setSequenceSelection(Integer.parseInt(String.valueOf(row.get(7)))); 133 query.setNumPositions(Integer.parseInt(String.valueOf(row.get(8)))); 134 query.setSeed(Long.parseLong(String.valueOf(row.get(9)))); 135 136 // leaves, partitions, and postitions are stored as 137 // ';' delimited strings in the table so they need to 138 // be parsed when extracted from the table 139 // ****************************************************** 140 // These MUST be uncommented when any query contains a CLOB 141 // ****************************************************** 142 // query.setLeaves(Database.readClobSet(row.get(10))); 143 // query.setPartitions(Database.readClobSet(row.get(11))); 144 // query.setPositions(Database.readClobSet(row.get(12))); 145 146 query.setSaved(true); // saved since we just got it from the db 147 } 148 // UNCOMMENT THIS WHEN CLOBS ARE RESTORED 149 // } catch (SQLException e) { 150 } catch (Exception e) { 151 CrimsonUtils.printError("SQL error loading CLOB in QUERIES."); 152 CrimsonUtils.printError(e.getMessage()); 153 return; 154 } 155 */ 156 } 157 158 /** 159 * This will publish the specified query in the QUERIES table in the 160 * current database. It will overwrite any query already in the 161 * table with the same name. 162 */ 163 public static void publish(Query query) { 164 if (query == null) { 165 CrimsonUtils.printError("Unable to publish 'null' query ."); 166 return; 167 } 168 169 publish(query, query.getID(), true); 170 } 171 172 /** 173 * This will publish the specified query in the QUERIES table in 174 * the current database. If 'overwrite' is true then this will 175 * replace any query object with the same name, already in the 176 * table. If 'id' is not empty, then it will be used, instead of 177 * query.id, as the name of the query to be published 178 * (ie. QUERIES.id). 179 */ 180 public static void publish(Query query, String id, boolean overwrite) { 181 if (query == null) { 182 CrimsonUtils.printError("Unable to publish 'null' query ."); 183 return; 184 } 185 186 // make sure a database is open. We don't need to check for 187 // the existance of the QUERIES database because this was done 188 // when the database was opened. 189 if (! Database.isOpen()) { 190 CrimsonUtils.printError("Can't publish query, no open database."); 191 return; 192 } 193 194 // use 'id' if present, otherwise query.id 195 if (CrimsonUtils.isEmpty(id)) id = query.getID(); 196 197 String sql; 198 // if query already exists then update, else insert 199 if (dbContains(id)) { 200 // if 'overwrite' is false then exit on error 201 if (! overwrite) { 202 CrimsonUtils.printError("Query already exists in database."); 203 return; 204 } 205 206 // update existing query 207 sql = "UPDATE QUERIES SET"; 208 sql += " notes = '" + query.getNotes() + "'"; 209 sql += " , tree_id = '" + query.getTreeID() + "'"; 210 sql += " , leaf_select = " + query.getLeafSelection(); 211 sql += " , num_leaves = " + query.getNumLeaves(); 212 sql += " , temp_depth_thresh = " + query.getTempDepthThresh(); 213 sql += " , level_thresh = " + query.getLevelThresh(); 214 sql += " , seq_select = " + query.getSequenceSelection(); 215 sql += " , num_pos = " + query.getNumPositions(); 216 sql += " , seed = " + query.getSeed(); 217 sql += " WHERE id = '" + id.toUpperCase() + "' "; 218 219 } else { // insert new query 220 sql = "INSERT INTO QUERIES "; 221 sql += "(id, tree_id, notes, leaf_select, num_leaves, temp_depth_thresh, level_thresh"; 222 sql += ", seq_select, num_pos, seed) VALUES ('" + id + "'"; 223 sql += ", '" + query.getTreeID() + "'"; 224 sql += ", '" + query.getNotes() + "'"; 225 sql += ", " + query.getLeafSelection(); 226 sql += ", " + query.getNumLeaves(); 227 sql += ", " + query.getTempDepthThresh(); 228 sql += ", " + query.getLevelThresh(); 229 sql += ", " + query.getSequenceSelection(); 230 sql += ", " + query.getNumPositions(); 231 sql += ", " + query.getSeed() + ")"; 232 } 233 234 // UPDATE the VARCHAR2 and NUMBER fields 235 if (! Database.execUpdate(sql)) { 236 CrimsonUtils.printError("Query NOT published."); 237 return; 238 } 239 240 // UPDATE the CLOBs 241 // leaves, partitions, and postitions are stored as ';' 242 // delimited strings in the table so they need to be 243 // parsed when added to the table 244 if (! Database.writeClobSet("QUERIES", id, "leaves", query.getLeaves())) { 245 CrimsonUtils.printError("Can't update leaves. Query NOT published."); 246 return; 247 } 248 if (! Database.writeClobSet("QUERIES", id, "partitions", query.getPartitions())) { 249 CrimsonUtils.printError("Can't update partitions. Query NOT published."); 250 return; 251 } 252 if (! Database.writeClobSet("QUERIES", id, "positions", query.getPositions())) { 253 CrimsonUtils.printError("Can't update positions. Query NOT published."); 254 return; 255 } 256 257 query.setSaved(true); // saved since we just publised it to the db 258 259 // if we get this far then we didn't error anywhere 260 CrimsonUtils.printMsg("Query published."); 261 } 262 263 /** 264 * This will return an ArrayList with id's for all queries in the 265 * QUERIES table. 266 */ 267 public static ArrayList dbList() { 268 if (! Database.isOpen()) { 269 CrimsonUtils.printError("No database open."); 270 return null; 271 } 272 273 String sql = "SELECT id FROM QUERIES"; 274 return Database.sqlToArray(sql); 275 } 276 277 /** 278 * This will return true if a query exists in QUERIES with an id 279 * equal to 'id'. 280 */ 281 public static boolean dbContains(String id) { 282 ArrayList out = Database.sqlToArray("SELECT id FROM QUERIES WHERE id = '" + id.toUpperCase() + "'"); 283 if ((out == null) || (out.size() == 0)) return false; 284 else return true; 285 } 286 287 /** Removes a query from the database. */ 288 public static boolean delete(String id) { 289 boolean val = Database.delete("QUERIES", id); 290 291 // remove the query from the relevant lists 292 ObjectHandles.deleteQuery(id); 293 294 return val; 295 } 296 297 /** 298 * This will repeatedly run a query, generating a NEXUS file for 299 * each run. The output will be a NEXUS file that includes the 300 * Crimson and tree blocks. Query notes will not be included. 301 * @param queryID string ID for query to be run 302 * @param filename filename for NEXUS output 303 * @param incSequence when true, inner sequence IDs will be 304 * included in the NEXUS file 305 * @param numRuns number of runs 306 */ 307 public static void run(String queryID, String filename, boolean incSequence, int numRuns) { 308 if (CrimsonUtils.isEmpty(filename)) { 309 CrimsonUtils.printError("No NEXUS file specified."); 310 return; 311 } 312 313 if (numRuns > 1) { 314 Query query = ObjectHandles.getQuery(queryID); 315 if (query == null) { 316 CrimsonUtils.printError("Query not found."); 317 return; 318 } 319 if (query.getSeed() > -1) { 320 CrimsonUtils.printError("Can not run query repetitions when query seed is set."); 321 return; 322 } 323 324 for (int i = 0; i < numRuns; i++) { 325 run(queryID, filename + "_" + i, incSequence, true, 1, false); 326 } 327 } else { 328 run(queryID, filename, incSequence, true, 1, false); 329 } 330 } 331 332 /* 333 // This will perform a query of a tree. *** REPLACED with expanded version below 334 public static void run(String queryID, String filename, boolean incSequence) { 335 if (! Database.isOpen()) { 336 CrimsonUtils.printError("Must open a database before running a query."); 337 return; 338 } 339 340 if (CrimsonUtils.isEmpty(filename)) { 341 CrimsonUtils.printError("No NEXUS file specified."); 342 return; 343 } 344 File file = new File(filename); 345 // if the file already exists and not supposed to overwrite 346 // it, then return on error. 347 348 // if (file.exists()) { 349 // CrimsonUtils.printError("File \"" + filename + "\" already exists."); 350 // return; 351 // } 352 353 Query query = ObjectHandles.getQuery(queryID); 354 if (query == null) { 355 CrimsonUtils.printError("Query not found."); 356 return; 357 } 358 359 if (CrimsonUtils.isEmpty(query.getTreeID())) { 360 CrimsonUtils.printError("Tree not specified within query."); 361 return; 362 } 363 364 Tree treeOrig = ObjectHandles.getTree(query.getTreeID()); 365 if (treeOrig == null) { 366 CrimsonUtils.printError("Tree does not exist: " + query.getTreeID()); 367 return; 368 } 369 370 // set seed prior to sampling tree 371 if (query.getSeed() > -1) CrimsonUtils.setRandomSeed(query.getSeed()); 372 373 // use Tree() routines to process the leaf selection. With 374 // the new tree we can get the new newick string and species. 375 Tree treeNew; 376 switch (query.getLeafSelection()) { 377 case 1: // random (numLeaves) 378 treeNew = treeOrig.randomSample(query.getNumLeaves()); 379 if (treeNew == null) return; 380 treeNew.computeStats(); 381 break; 382 case 2: // depth distributed (numLeaves, tempDepthThresh) 383 treeNew = treeOrig.uniformSampleByTempDepth(query.getTempDepthThresh(), query.getNumLeaves()); 384 if (treeNew == null) return; 385 treeNew.computeStats(); 386 break; 387 case 3: // depth weighted (numLeaves, tempDepthThresh) 388 treeNew = treeOrig.weightedSampleByTempDepth(query.getTempDepthThresh(), query.getNumLeaves()); 389 if (treeNew == null) return; 390 treeNew.computeStats(); 391 break; 392 case 4: // level distributed (numLeaves, levelThresh) 393 treeNew = treeOrig.uniformSampleByLevel(query.getLevelThresh(), query.getNumLeaves()); 394 if (treeNew == null) return; 395 treeNew.computeStats(); 396 break; 397 case 5: // level weighted (numLeaves, levelThresh) 398 treeNew = treeOrig.weightedSampleByLevel(query.getLevelThresh(), query.getNumLeaves()); 399 if (treeNew == null) return; 400 treeNew.computeStats(); 401 break; 402 case 6: // manual (leaves) 403 treeNew = treeOrig.manualSampleByID(new ArrayList(query.getLeaves())); 404 if (treeNew == null) return; 405 treeNew.computeStats(); 406 break; 407 default: 408 // this is equivalent to "case 0". if tree not already 409 // built, then build it now 410 if (! treeOrig.isBuilt()) treeOrig.computeStats(); 411 treeNew = treeOrig; 412 } 413 414 // make sure the tree is valid 415 if (treeNew.getNumSpecies() < 1) { 416 CrimsonUtils.printError("Sampled tree has no leaves."); 417 return; 418 } 419 420 String nchar; 421 ArrayList partitions; // this is an array to preserve order 422 423 // used by seq selection methods 1-4, this will contain a list 424 // of bp positions to be included in the output. these are 425 // arrays so that they can differ per partition, in case 426 // partitions very in size. 427 ArrayList sampledPositions = new ArrayList(); 428 429 // this will contain the partition data printed at the end of 430 // multi-partition NEXUS files 431 StringBuffer charPartition = new StringBuffer(); 432 433 ///////////// 434 // get the partitions to be included and the values for nchar, 435 // sampledPositions, and CharPartition 436 if ((query.getSequenceSelection() == 5) || (query.isOnlyStruct())) { 437 // only structure, so no partitions to be included 438 partitions = new ArrayList(); 439 nchar = "0"; 440 } else { 441 // figure out partition data to include 442 partitions = new ArrayList(query.getPartitions()); 443 if ((partitions == null) || (partitions.size() == 0)) { 444 // no partitions specificed, so assume 'ALL'. 445 partitions = Database.sqlToArray("SELECT id FROM PARTITIONS WHERE tree_id = '" + treeOrig.getID() + "'"); 446 if (partitions == null) partitions = new ArrayList(); 447 } 448 449 TreeSet expandedPositions = null; 450 if ((query.getSequenceSelection() == 3) || (query.getSequenceSelection() == 4)) { 451 // for manual selections, need to expand position 452 // string to figure. this is independent of the 453 // partition 454 expandedPositions = expandPositions(query.getPositions(), (query.getSequenceSelection() == 3)); 455 // if not able to expand then stop 456 if (expandedPositions == null) return; 457 } 458 459 // used to count num positions across partitions 460 int iLength = 0; 461 462 // set up the array such that each partition will have 463 // it's own set of sampled positions 464 sampledPositions = new ArrayList(); 465 466 // make sure all partitions are valid and compute relevant 467 // sampling positions 468 for (int iPartition = 0; iPartition < partitions.size(); iPartition++) { 469 String partitionID = (String) partitions.get(iPartition); 470 471 // shouldn't need to test both database and partitionPool, but 472 // it's better to be careful 473 if ((! Partitions.dbContains(partitionID)) || (! ObjectHandles.containsPartition(partitionID))) { 474 CrimsonUtils.printError("Partition doesn't exists: " + partitionID); 475 return; 476 } 477 478 // get partition length 479 int pLength = (ObjectHandles.getPartition(partitionID)).getLength(); 480 if (pLength <= 0) { 481 CrimsonUtils.printError("Partition (" + partitionID + ") has invalid length: " + pLength); 482 return; 483 } 484 485 if ((query.getSequenceSelection() == 3) || (query.getSequenceSelection() == 4)) { 486 // manual selection 487 if (partitions.size() > 1) { 488 // manual selection is only valid for a single 489 // data partition. 490 CrimsonUtils.printError("Can't manually select positions when using more than one data partition."); 491 return; 492 } 493 494 // add valid positions to sampledPositions. Do 495 // this for every partition since partitions will 496 // have different lengths. Thus a valid position 497 // for one partition may be invalid for 498 // another. HOWEVER, only allow one partition to 499 // be selected, so doing this here should not be 500 // necessary. 501 TreeSet sampPos = new TreeSet(); 502 for (Iterator iExpPos = expandedPositions.iterator(); iExpPos.hasNext();) { 503 Integer indx = (Integer) iExpPos.next(); 504 if (indx.intValue() >= pLength) { 505 CrimsonUtils.printWarning("Ignoring out of bounds position:" + indx); 506 } else { 507 sampPos.add(indx); 508 } 509 } 510 511 // with manual selection, nchar is number of positions 512 iLength = sampPos.size(); 513 514 sampledPositions.add(sampPos); 515 516 } else { 517 // compute random positions to be sampled. We do 518 // this for each partition because the partition 519 // lengths may be different. 520 if (query.getSequenceSelection() == 1) { // random codon sampling 521 sampledPositions.add(getRandomPositions(pLength, query.getNumPositions(), true)); 522 } else if (query.getSequenceSelection() == 2) { // random bp sampling 523 sampledPositions.add(getRandomPositions(pLength, query.getNumPositions(), false)); 524 } 525 526 // this is the case for seq selection 1 and 2 527 int numPositions = query.getNumPositions(); 528 // seq selection 0 is select all 529 if (query.getSequenceSelection() == 0) numPositions = pLength; 530 531 // count num of positions to include per partition 532 if (pLength < numPositions) { 533 // warn user that partition length is smaller 534 // than requested number of positions 535 String msg = "Partition '" + partitionID + "' does not contain '"; 536 msg += numPositions + "' positions as requested."; 537 CrimsonUtils.printWarning(msg); 538 539 // setup charPartition string for printing at 540 // end of NEXUS file 541 charPartition.append(" " + NexusFile.quote(partitionID) + ":"); 542 charPartition.append((iLength + 1) + "-" + (iLength + pLength) + "\n"); 543 charPartition.append((iPartition + 1 < partitions.size()) ? "," : ";"); 544 545 iLength += pLength; 546 } else { 547 // setup charPartition string for printing at 548 // end of NEXUS file 549 charPartition.append(" " + NexusFile.quote(partitionID) + ":"); 550 charPartition.append((iLength + 1) + "-" + (iLength + numPositions) + "\n"); 551 charPartition.append((iPartition + 1 < partitions.size()) ? "," : ";"); 552 553 iLength += numPositions; 554 } 555 } 556 } 557 558 // convert integer nchar value to string which will be 559 // used below 560 nchar = String.valueOf(iLength); 561 } 562 563 //////////////////////////////// 564 // begin writting the NEXUS file 565 //////////////////////////////// 566 try { 567 FileWriter fWriter = new FileWriter(file); 568 BufferedWriter bWriter = new BufferedWriter(fWriter); 569 570 bWriter.write("#NEXUS\n\n"); 571 572 // include query as comments at top of NEXUS file 573 bWriter.write("[\nCRIMSON QUERY:\n"); 574 bWriter.write(query.toString()); 575 bWriter.write("]\n\n"); 576 577 ArrayList species = incSequence ? treeNew.getSpeciesList() : treeNew.getLeaves(); 578 int ntax = species.size(); 579 580 // if no partitions then no data 581 if (partitions.size() > 0) { 582 // print character data header 583 bWriter.write("BEGIN DATA;\n"); 584 bWriter.write(" DIMENSIONS NTAX=" + ntax + " NCHAR=" + nchar + ";\n"); 585 if (partitions.size() > 1) bWriter.write(" FORMAT DATATYPE=rna GAP=- INTERLEAVE;\n"); 586 else bWriter.write(" FORMAT DATATYPE=rna GAP=-;\n"); 587 bWriter.write(" MATRIX\n"); 588 589 // process sequence data 590 for (int iPartition = 0; iPartition < partitions.size(); iPartition++) { 591 String partitionID = (String) partitions.get(iPartition); 592 593 // if more than 1 partition, then add a comment 594 // containing the name of the partition to help 595 // deliniate the interleaved sections 596 if (partitions.size() > 1) bWriter.write(" [" + partitionID.toUpperCase() + "]\n"); 597 598 // process species in chunks of size SPECIES_CHUNK 599 for (int iSpecies = 0; iSpecies < ntax; iSpecies += SPECIES_CHUNK) { 600 int endChunk = iSpecies + SPECIES_CHUNK; 601 if (endChunk > ntax) endChunk = ntax; 602 603 // build query 604 String sql = "SELECT species_id, sequence FROM PART_DATA WHERE partition_id = '" + partitionID + "'"; 605 sql += " and (species_id = '" + ((Species) species.get(iSpecies)).getID() + "'"; 606 for (int iChunk = iSpecies + 1; iChunk < endChunk; iChunk++) { 607 sql += " or species_id = '" + ((Species) species.get(iChunk)).getID() + "'"; 608 } 609 sql += ")"; 610 611 ArrayList results = Database.sqlToArray(sql); 612 613 if (results == null) { 614 CrimsonUtils.printError("NEXUS file incomplete. Unable to get partition data: " + sql); 615 bWriter.flush(); 616 bWriter.close(); 617 fWriter.close(); 618 return; 619 } 620 621 for (Iterator rows = results.iterator(); rows.hasNext();) { 622 ArrayList row = (ArrayList) rows.next(); 623 String data = Database.readClob(row.get(1)); 624 625 if ((sampledPositions.size() > 0) && (sampledPositions.get(iPartition) != null)) { 626 // only include positions in sampledPositions[] 627 StringBuffer sampledData = new StringBuffer(); 628 TreeSet sampPos = (TreeSet) sampledPositions.get(iPartition); 629 for (Iterator i = sampPos.iterator(); i.hasNext();) { 630 sampledData.append(data.charAt(((Integer) i.next()).intValue())); 631 } 632 data = sampledData.toString(); 633 } 634 bWriter.write(" " + String.valueOf(row.get(0)) + " " + data + "\n"); 635 } 636 } 637 638 // put in a space between partitions 639 bWriter.write("\n"); 640 } 641 bWriter.write(";\nEND;\n\n"); 642 643 // print crimson data header 644 bWriter.write("BEGIN CRIMSON;\n"); 645 bWriter.write(" MATRIX\n"); 646 647 // process structure data 648 for (int iPartition = 0; iPartition < partitions.size(); iPartition++) { 649 String partitionID = (String) partitions.get(iPartition); 650 651 // if more than 1 partition, then add a comment 652 // containing the name of the partition to help 653 // deliniate the interleaved sections 654 if (partitions.size() > 1) bWriter.write(" [" + partitionID.toUpperCase() + "]\n"); 655 656 // process species in chunks of size SPECIES_CHUNK 657 for (int iSpecies = 0; iSpecies < ntax; iSpecies += SPECIES_CHUNK) { 658 int endChunk = iSpecies + SPECIES_CHUNK; 659 if (endChunk > ntax) endChunk = ntax; 660 661 // build query 662 String sql = "SELECT species_id, structure FROM PART_DATA WHERE partition_id = '" + partitionID + "'"; 663 sql += " and (species_id = '" + ((Species) species.get(iSpecies)).getID() + "'"; 664 for (int iChunk = iSpecies + 1; iChunk < endChunk; iChunk++) { 665 sql += " or species_id = '" + ((Species) species.get(iChunk)).getID() + "'"; 666 } 667 sql += ")"; 668 669 ArrayList results = Database.sqlToArray(sql); 670 671 if (results == null) { 672 CrimsonUtils.printError("NEXUS file incomplete. Unable to get partition data: " + sql); 673 bWriter.flush(); 674 bWriter.close(); 675 fWriter.close(); 676 return; 677 } 678 679 for (Iterator rows = results.iterator(); rows.hasNext();) { 680 ArrayList row = (ArrayList) rows.next(); 681 String data = Database.readClob(row.get(1)); 682 683 if ((sampledPositions.size() > 0) && (sampledPositions.get(iPartition) != null)) { 684 // only include positions in sampledPositions[] 685 StringBuffer sampledData = new StringBuffer(); 686 TreeSet sampPos = (TreeSet) sampledPositions.get(iPartition); 687 for (Iterator i = sampPos.iterator(); i.hasNext();) { 688 sampledData.append(data.charAt(((Integer) i.next()).intValue())); 689 } 690 data = sampledData.toString(); 691 } 692 bWriter.write(" " + String.valueOf(row.get(0)) + " " + data + "\n"); 693 } 694 } 695 696 // put in a space between partitions 697 bWriter.write("\n"); 698 } 699 bWriter.write(";\nEND;\n\n"); 700 701 if (partitions.size() > 1) { 702 bWriter.write("BEGIN SETS;\n"); 703 bWriter.write(" CharPartition partition = \n"); 704 bWriter.write(charPartition.toString()); 705 bWriter.write("END;\n"); 706 } 707 } else { 708 // the taxa are currently being loaded from the DATA 709 // block. if there is no DATA block then we need to 710 // add the TAXA block so that the TREES block will bel 711 // properly loaded. We currently only include the 712 // TAXA block if no DATA block, because we can't 713 // easily control the order of the species in the DATA 714 // block and this order needs to be the same in the 715 // DATA and TAXA blocks. 716 717 // insert TAXA block 718 bWriter.write("BEGIN TAXA;\n"); 719 bWriter.write(" DIMENSIONS NTAX=" + ntax + ";\n"); 720 bWriter.write(" TAXLABELS\n"); 721 for (int iSpecies = 0; iSpecies < ntax; iSpecies++) { 722 bWriter.write(" " + ((Species) species.get(iSpecies)).getID() + "\n"); 723 } 724 bWriter.write(";\nEND;\n\n"); 725 } 726 727 // insert TREES block. This is placed after the data so 728 // that the taxa can be loaded from the DATA block. 729 bWriter.write("BEGIN TREES;\n"); 730 // make sure queryID is properly quoted 731 bWriter.write(" TREE " + NexusFile.quote(queryID) + " = " + treeNew.toNewick(incSequence) + ";\n"); 732 bWriter.write("END;\n\n"); 733 734 bWriter.flush(); 735 bWriter.close(); 736 fWriter.close(); 737 } catch (SQLException e) { 738 CrimsonUtils.printError("SQL error processing CLOB."); 739 CrimsonUtils.printError(e.getMessage()); 740 return; 741 } catch (FileNotFoundException e) { // problem with FileOutputStream 742 CrimsonUtils.printError("File \"" + filename + "\" can not be opened."); 743 CrimsonUtils.printError(e.getMessage()); 744 return; 745 } catch (IOException e) { // problem with ObjectOutputStream. 746 // XXX do we need to close bWriter()? 747 CrimsonUtils.printError("Error writting output file \"" + filename + "\"."); 748 CrimsonUtils.printError(e.getMessage()); 749 return; 750 } 751 752 CrimsonUtils.printMsg("Query finished."); 753 } 754 */ 755 756 /** 757 * This will perform a tree query, generating a NEXUS file. If 758 * Query.onlyStruct is true then incCrimson and incTree will be 759 * ignored. 760 * @param queryID string ID for query to be run 761 * @param filename filename for NEXUS output 762 * @param incSequence when true, inner sequence IDs will be 763 * included in the NEXUS file 764 * @param incCrimson when true, the CRIMSON block will be included 765 * in the NEXUS file 766 * @param incTree There are multiple options dictated by incTree: 0 = no 767 * tree will be output; 1 = tree is included in NEXUS file; 2 = 768 * tree ouptut to second NEXUS file ("*.tree"); 3 = rooted tree 769 * ouptut to a newick file ("*.newick"); 4 = unrooted tree ouptut 770 * to a newick file ("*.newick") 771 * @param incNotes When true, the Query.notes will be appended 772 * to the end of the NEXUS file. The notes can, for example, 773 * contain a PAUP block. It is assumed that the notes are properly 774 * formatted for the NEXUS file. 775 * @return The found tree object is returned (null when errors). 776 */ 777 public static Tree run(String queryID, String filename, boolean incSequence, boolean incCrimson, 778 int incTree, boolean incNotes) { 779 if (! Database.isOpen()) { 780 CrimsonUtils.printError("Must open a database before running a query."); 781 return null; 782 } 783 784 if (CrimsonUtils.isEmpty(filename)) { 785 CrimsonUtils.printError("No NEXUS file specified."); 786 return null; 787 } 788 File file = new File(filename); 789 790 Query query = ObjectHandles.getQuery(queryID); 791 if (query == null) { 792 CrimsonUtils.printError("Query not found."); 793 return null; 794 } 795 796 if (CrimsonUtils.isEmpty(query.getTreeID())) { 797 CrimsonUtils.printError("Tree not specified within query."); 798 return null; 799 } 800 801 Tree treeOrig = ObjectHandles.getTree(query.getTreeID()); 802 if (treeOrig == null) { 803 CrimsonUtils.printError("Tree does not exist: " + query.getTreeID()); 804 return null; 805 } 806 807 // set seed prior to sampling tree 808 if (query.getSeed() > -1) CrimsonUtils.setRandomSeed(query.getSeed()); 809 810 // use Tree() routines to process the leaf selection. With 811 // the new tree we can get the new newick string and species. 812 Tree treeNew; 813 switch (query.getLeafSelection()) { 814 case 1: // random (numLeaves) 815 treeNew = treeOrig.randomSample(query.getNumLeaves()); 816 if (treeNew == null) return null; 817 treeNew.computeStats(); 818 break; 819 case 2: // depth distributed (numLeaves, tempDepthThresh) 820 treeNew = treeOrig.uniformSampleByTempDepth(query.getTempDepthThresh(), query.getNumLeaves()); 821 if (treeNew == null) return null; 822 treeNew.computeStats(); 823 break; 824 case 3: // depth weighted (numLeaves, tempDepthThresh) 825 treeNew = treeOrig.weightedSampleByTempDepth(query.getTempDepthThresh(), query.getNumLeaves()); 826 if (treeNew == null) return null; 827 treeNew.computeStats(); 828 break; 829 case 4: // level distributed (numLeaves, levelThresh) 830 treeNew = treeOrig.uniformSampleByLevel(query.getLevelThresh(), query.getNumLeaves()); 831 if (treeNew == null) return null; 832 treeNew.computeStats(); 833 break; 834 case 5: // level weighted (numLeaves, levelThresh) 835 treeNew = treeOrig.weightedSampleByLevel(query.getLevelThresh(), query.getNumLeaves()); 836 if (treeNew == null) return null; 837 treeNew.computeStats(); 838 break; 839 case 6: // manual (leaves) 840 treeNew = treeOrig.manualSampleByID(new ArrayList(query.getLeaves())); 841 if (treeNew == null) return null; 842 treeNew.computeStats(); 843 break; 844 default: 845 // this is equivalent to "case 0". if tree not already 846 // built, then build it now 847 if (! treeOrig.isBuilt()) treeOrig.computeStats(); 848 treeNew = treeOrig; 849 } 850 851 // make sure the tree is valid 852 if (treeNew.getNumSpecies() < 1) { 853 CrimsonUtils.printError("Sampled tree has no leaves."); 854 return null; 855 } 856 857 String nchar; 858 ArrayList partitions; // this is an array to preserve order 859 860 // used by seq selection methods 1-4, this will contain a list 861 // of bp positions to be included in the output. these are 862 // arrays so that they can differ per partition, in case 863 // partitions very in size. 864 ArrayList sampledPositions = new ArrayList(); 865 866 // this will contain the partition data printed at the end of 867 // multi-partition NEXUS files 868 StringBuffer charPartition = new StringBuffer(); 869 870 ///////////// 871 // get the partitions to be included and the values for nchar, 872 // sampledPositions, and CharPartition 873 if ((query.getSequenceSelection() == 5) || (query.isOnlyStruct())) { 874 // only structure, so no partitions to be included 875 partitions = new ArrayList(); 876 nchar = "0"; 877 } else { 878 // figure out partition data to include 879 partitions = new ArrayList(query.getPartitions()); 880 if ((partitions == null) || (partitions.size() == 0)) { 881 // no partitions specificed, so assume 'ALL'. 882 partitions = Database.sqlToArray("SELECT id FROM PARTITIONS WHERE tree_id = '" + treeOrig.getID() + "'"); 883 if (partitions == null) partitions = new ArrayList(); 884 } 885 886 TreeSet expandedPositions = null; 887 if ((query.getSequenceSelection() == 3) || (query.getSequenceSelection() == 4)) { 888 // for manual selections, need to expand position 889 // string to figure. this is independent of the 890 // partition 891 expandedPositions = expandPositions(query.getPositions(), (query.getSequenceSelection() == 3)); 892 // if not able to expand then stop 893 if (expandedPositions == null) return null; 894 } 895 896 // used to count num positions across partitions 897 int iLength = 0; 898 899 // set up the array such that each partition will have 900 // it's own set of sampled positions 901 sampledPositions = new ArrayList(); 902 903 // make sure all partitions are valid and compute relevant 904 // sampling positions 905 for (int iPartition = 0; iPartition < partitions.size(); iPartition++) { 906 String partitionID = (String) partitions.get(iPartition); 907 908 // shouldn't need to test both database and partitionPool, but 909 // it's better to be careful 910 if ((! Partitions.dbContains(partitionID)) || (! ObjectHandles.containsPartition(partitionID))) { 911 CrimsonUtils.printError("Partition doesn't exists: " + partitionID); 912 return null; 913 } 914 915 // get partition length 916 int pLength = (ObjectHandles.getPartition(partitionID)).getLength(); 917 if (pLength <= 0) { 918 CrimsonUtils.printError("Partition (" + partitionID + ") has invalid length: " + pLength); 919 return null; 920 } 921 922 if ((query.getSequenceSelection() == 3) || (query.getSequenceSelection() == 4)) { 923 // manual selection 924 if (partitions.size() > 1) { 925 // manual selection is only valid for a single 926 // data partition. 927 CrimsonUtils.printError("Can't manually select positions when using more than one data partition."); 928 return null; 929 } 930 931 // add valid positions to sampledPositions. Do 932 // this for every partition since partitions will 933 // have different lengths. Thus a valid position 934 // for one partition may be invalid for 935 // another. HOWEVER, only allow one partition to 936 // be selected, so doing this here should not be 937 // necessary. 938 TreeSet sampPos = new TreeSet(); 939 for (Iterator iExpPos = expandedPositions.iterator(); iExpPos.hasNext();) { 940 Integer indx = (Integer) iExpPos.next(); 941 if (indx.intValue() >= pLength) { 942 CrimsonUtils.printWarning("Ignoring out of bounds position:" + indx); 943 } else { 944 sampPos.add(indx); 945 } 946 } 947 948 // with manual selection, nchar is number of positions 949 iLength = sampPos.size(); 950 951 sampledPositions.add(sampPos); 952 953 } else { 954 // compute random positions to be sampled. We do 955 // this for each partition because the partition 956 // lengths may be different. 957 if (query.getSequenceSelection() == 1) { // random codon sampling 958 sampledPositions.add(getRandomPositions(pLength, query.getNumPositions(), true)); 959 } else if (query.getSequenceSelection() == 2) { // random bp sampling 960 sampledPositions.add(getRandomPositions(pLength, query.getNumPositions(), false)); 961 } 962 963 // this is the case for seq selection 1 and 2 964 int numPositions = query.getNumPositions(); 965 // seq selection 0 is select all 966 if (query.getSequenceSelection() == 0) numPositions = pLength; 967 968 // count num of positions to include per partition 969 if (pLength < numPositions) { 970 // warn user that partition length is smaller 971 // than requested number of positions 972 String msg = "Partition '" + partitionID + "' does not contain '"; 973 msg += numPositions + "' positions as requested."; 974 CrimsonUtils.printWarning(msg); 975 976 // setup charPartition string for printing at 977 // end of NEXUS file 978 charPartition.append(" " + NexusFile.quote(partitionID) + ":"); 979 charPartition.append((iLength + 1) + "-" + (iLength + pLength) + "\n"); 980 charPartition.append((iPartition + 1 < partitions.size()) ? "," : ";"); 981 982 iLength += pLength; 983 } else { 984 // setup charPartition string for printing at 985 // end of NEXUS file 986 charPartition.append(" " + NexusFile.quote(partitionID) + ":"); 987 charPartition.append((iLength + 1) + "-" + (iLength + numPositions) + "\n"); 988 charPartition.append((iPartition + 1 < partitions.size()) ? "," : ";"); 989 990 iLength += numPositions; 991 } 992 } 993 } 994 995 // convert integer nchar value to string which will be 996 // used below 997 nchar = String.valueOf(iLength); 998 } 999 1000 //////////////////////////////// 1001 // begin writting the NEXUS file 1002 //////////////////////////////// 1003 try { 1004 FileWriter fWriter = new FileWriter(file); 1005 BufferedWriter bWriter = new BufferedWriter(fWriter); 1006 1007 bWriter.write("#NEXUS\n\n"); 1008 1009 // include query as comments at top of NEXUS file 1010 bWriter.write("[\nCRIMSON QUERY:\n"); 1011 bWriter.write(query.toString()); 1012 bWriter.write("]\n\n"); 1013 1014 ArrayList species = incSequence ? treeNew.getSpeciesList() : treeNew.getLeaves(); 1015 int ntax = species.size(); 1016 1017 // if no partitions then no data 1018 if (partitions.size() > 0) { 1019 // print character data header 1020 bWriter.write("BEGIN DATA;\n"); 1021 bWriter.write(" DIMENSIONS NTAX=" + ntax + " NCHAR=" + nchar + ";\n"); 1022 if (partitions.size() > 1) bWriter.write(" FORMAT DATATYPE=rna GAP=- INTERLEAVE;\n"); 1023 else bWriter.write(" FORMAT DATATYPE=rna GAP=-;\n"); 1024 bWriter.write(" MATRIX\n"); 1025 1026 // process sequence data 1027 for (int iPartition = 0; iPartition < partitions.size(); iPartition++) { 1028 String partitionID = (String) partitions.get(iPartition); 1029 1030 // if more than 1 partition, then add a comment 1031 // containing the name of the partition to help 1032 // deliniate the interleaved sections 1033 if (partitions.size() > 1) bWriter.write(" [" + partitionID.toUpperCase() + "]\n"); 1034 1035 // process species in chunks of size SPECIES_CHUNK 1036 for (int iSpecies = 0; iSpecies < ntax; iSpecies += SPECIES_CHUNK) { 1037 int endChunk = iSpecies + SPECIES_CHUNK; 1038 if (endChunk > ntax) endChunk = ntax; 1039 1040 // build query 1041 String sql = "SELECT species_id, sequence FROM PART_DATA WHERE partition_id = '" + partitionID + "'"; 1042 sql += " and (species_id = '" + ((Species) species.get(iSpecies)).getID() + "'"; 1043 for (int iChunk = iSpecies + 1; iChunk < endChunk; iChunk++) { 1044 sql += " or species_id = '" + ((Species) species.get(iChunk)).getID() + "'"; 1045 } 1046 sql += ")"; 1047 1048 // *** DEBUGGING *** 1049 //FileIO.writeString(file+".debug", true, sql + "\n"); 1050 // *** DEBUGGING *** 1051 1052 ArrayList results = Database.sqlToArray(sql); 1053 1054 if (results == null) { 1055 CrimsonUtils.printError("NEXUS file incomplete. Unable to get partition data: " + sql); 1056 bWriter.flush(); 1057 bWriter.close(); 1058 fWriter.close(); 1059 return null; 1060 } 1061 1062 for (Iterator rows = results.iterator(); rows.hasNext();) { 1063 ArrayList row = (ArrayList) rows.next(); 1064 String data = Database.readClob(row.get(1)); 1065 1066 if ((sampledPositions.size() > 0) && (sampledPositions.get(iPartition) != null)) { 1067 // only include positions in sampledPositions[] 1068 StringBuffer sampledData = new StringBuffer(); 1069 TreeSet sampPos = (TreeSet) sampledPositions.get(iPartition); 1070 for (Iterator i = sampPos.iterator(); i.hasNext();) { 1071 sampledData.append(data.charAt(((Integer) i.next()).intValue())); 1072 } 1073 data = sampledData.toString(); 1074 } 1075 bWriter.write(" " + String.valueOf(row.get(0)) + " " + data + "\n"); 1076 } 1077 } 1078 1079 // put in a space between partitions 1080 bWriter.write("\n"); 1081 } 1082 bWriter.write(";\nEND;\n\n"); 1083 // done printing DATA block 1084 1085 if (incCrimson) { 1086 // print crimson data header 1087 bWriter.write("BEGIN CRIMSON;\n"); 1088 bWriter.write(" MATRIX\n"); 1089 1090 // process structure data 1091 for (int iPartition = 0; iPartition < partitions.size(); iPartition++) { 1092 String partitionID = (String) partitions.get(iPartition); 1093 1094 // if more than 1 partition, then add a comment 1095 // containing the name of the partition to help 1096 // deliniate the interleaved sections 1097 if (partitions.size() > 1) bWriter.write(" [" + partitionID.toUpperCase() + "]\n"); 1098 1099 // process species in chunks of size SPECIES_CHUNK 1100 for (int iSpecies = 0; iSpecies < ntax; iSpecies += SPECIES_CHUNK) { 1101 int endChunk = iSpecies + SPECIES_CHUNK; 1102 if (endChunk > ntax) endChunk = ntax; 1103 1104 // build query 1105 String sql = "SELECT species_id, structure FROM PART_DATA WHERE partition_id = '" + partitionID + "'"; 1106 sql += " and (species_id = '" + ((Species) species.get(iSpecies)).getID() + "'"; 1107 for (int iChunk = iSpecies + 1; iChunk < endChunk; iChunk++) { 1108 sql += " or species_id = '" + ((Species) species.get(iChunk)).getID() + "'"; 1109 } 1110 sql += ")"; 1111 1112 ArrayList results = Database.sqlToArray(sql); 1113 1114 if (results == null) { 1115 CrimsonUtils.printError("NEXUS file incomplete. Unable to get partition data: " + sql); 1116 bWriter.flush(); 1117 bWriter.close(); 1118 fWriter.close(); 1119 return null; 1120 } 1121 1122 for (Iterator rows = results.iterator(); rows.hasNext();) { 1123 ArrayList row = (ArrayList) rows.next(); 1124 String data = Database.readClob(row.get(1)); 1125 1126 if ((sampledPositions.size() > 0) && (sampledPositions.get(iPartition) != null)) { 1127 // only include positions in sampledPositions[] 1128 StringBuffer sampledData = new StringBuffer(); 1129 TreeSet sampPos = (TreeSet) sampledPositions.get(iPartition); 1130 for (Iterator i = sampPos.iterator(); i.hasNext();) { 1131 sampledData.append(data.charAt(((Integer) i.next()).intValue())); 1132 } 1133 data = sampledData.toString(); 1134 } 1135 bWriter.write(" " + String.valueOf(row.get(0)) + " " + data + "\n"); 1136 } 1137 } 1138 1139 // put in a space between partitions 1140 bWriter.write("\n"); 1141 } 1142 bWriter.write(";\nEND;\n\n"); 1143 } // done printing CRIMSON block 1144 1145 if (partitions.size() > 1) { 1146 bWriter.write("BEGIN SETS;\n"); 1147 bWriter.write(" CharPartition partition = \n"); 1148 bWriter.write(charPartition.toString()); 1149 bWriter.write("END;\n"); 1150 } 1151 } else { 1152 // the taxa are currently being loaded from the DATA 1153 // block. if there is no DATA block then we need to 1154 // add the TAXA block so that the TREES block will bel 1155 // properly loaded. We currently only include the 1156 // TAXA block if no DATA block, because we can't 1157 // easily control the order of the species in the DATA 1158 // block and this order needs to be the same in the 1159 // DATA and TAXA blocks. 1160 1161 // insert TAXA block 1162 bWriter.write("BEGIN TAXA;\n"); 1163 bWriter.write(" DIMENSIONS NTAX=" + ntax + ";\n"); 1164 bWriter.write(" TAXLABELS\n"); 1165 for (int iSpecies = 0; iSpecies < ntax; iSpecies++) { 1166 bWriter.write(" " + ((Species) species.get(iSpecies)).getID() + "\n"); 1167 } 1168 bWriter.write(";\nEND;\n\n"); 1169 } 1170 1171 switch (incTree) { 1172 case 1: 1173 // insert TREES block. This is placed after the data 1174 // so that the taxa can be loaded from the DATA block. 1175 bWriter.write("BEGIN TREES;\n"); 1176 // make sure queryID is properly quoted 1177 bWriter.write(" TREE " + NexusFile.quote(queryID) + " = " + treeNew.toNewick(incSequence) + ";\n"); 1178 bWriter.write("END;\n\n"); 1179 break; 1180 case 2: 1181 FileWriter fWriter_t = new FileWriter(file + ".tree"); 1182 BufferedWriter bWriter_t = new BufferedWriter(fWriter_t); 1183 // write newick to NEXUS file 1184 bWriter_t.write("#NEXUS;\n\n"); 1185 bWriter_t.write("BEGIN TREES;\n"); 1186 // make sure queryID is properly quoted 1187 bWriter_t.write(" TREE " + NexusFile.quote(queryID) + " = " + treeNew.toNewick(incSequence) + ";\n"); 1188 bWriter_t.write("END;\n\n"); 1189 bWriter_t.flush(); 1190 bWriter_t.close(); 1191 fWriter_t.close(); 1192 break; 1193 case 3: 1194 case 4: 1195 fWriter_t = new FileWriter(file + ".newick"); 1196 bWriter_t = new BufferedWriter(fWriter_t); 1197 // output newick rooted 1198 if (incTree == 3) bWriter_t.write(treeNew.toNewick(incSequence, true) + ";\n"); 1199 // output newick unrooted 1200 else bWriter_t.write(treeNew.toNewick(incSequence, false) + ";\n"); 1201 bWriter_t.flush(); 1202 bWriter_t.close(); 1203 fWriter_t.close(); 1204 break; 1205 } // done printing TREES block 1206 1207 if (incNotes) { 1208 // insert Query.notes. It is assumed that the notes 1209 // are properly formatted for the NEXUS file. 1210 bWriter.write(query.getNotes() + "\n"); 1211 } // done printing TREES block 1212 1213 bWriter.flush(); 1214 bWriter.close(); 1215 fWriter.close(); 1216 } catch (SQLException e) { 1217 CrimsonUtils.printError("SQL error processing CLOB."); 1218 CrimsonUtils.printError(e.getMessage()); 1219 return null; 1220 } catch (FileNotFoundException e) { // problem with FileOutputStream 1221 CrimsonUtils.printError("File \"" + filename + "\" can not be opened."); 1222 CrimsonUtils.printError(e.getMessage()); 1223 return null; 1224 } catch (IOException e) { // problem with ObjectOutputStream. 1225 // XXX do we need to close bWriter()? 1226 CrimsonUtils.printError("Error writting output file \"" + filename + "\"."); 1227 CrimsonUtils.printError(e.getMessage()); 1228 return null; 1229 } 1230 1231 CrimsonUtils.printMsg("Query finished."); 1232 1233 return treeNew; 1234 } 1235 1236 /** 1237 * This will perform a tree query, generating a phylip formatted 1238 * file. 1239 * @param queryID string ID for query to be run 1240 * @param filename filename for NEXUS output 1241 * @param incTree There are multiple options dictated by incTree: 0 = no 1242 * tree will be output; 1 = no tree will be output; 2 = 1243 * tree ouptut to second NEXUS file ("*.tree"); 3 = rooted tree 1244 * ouptut to a newick file ("*.newick"); 4 = unrooted tree ouptut 1245 * to a newick file ("*.newick") 1246 * @return The found tree object is returned (null when errors). 1247 */ 1248 public static Tree runPhylip(String queryID, String filename, int incTree) { 1249 if (! Database.isOpen()) { 1250 CrimsonUtils.printError("Must open a database before running a query."); 1251 return null; 1252 } 1253 1254 if (CrimsonUtils.isEmpty(filename)) { 1255 CrimsonUtils.printError("No PHYLIP file specified."); 1256 return null; 1257 } 1258 File file = new File(filename); 1259 1260 Query query = ObjectHandles.getQuery(queryID); 1261 if (query == null) { 1262 CrimsonUtils.printError("Query not found."); 1263 return null; 1264 } 1265 1266 if (CrimsonUtils.isEmpty(query.getTreeID())) { 1267 CrimsonUtils.printError("Tree not specified within query."); 1268 return null; 1269 } 1270 1271 Tree treeOrig = ObjectHandles.getTree(query.getTreeID()); 1272 if (treeOrig == null) { 1273 CrimsonUtils.printError("Tree does not exist: " + query.getTreeID()); 1274 return null; 1275 } 1276 1277 // set seed prior to sampling tree 1278 if (query.getSeed() > -1) CrimsonUtils.setRandomSeed(query.getSeed()); 1279 1280 // use Tree() routines to process the leaf selection. With 1281 // the new tree we can get the new newick string and species. 1282 Tree treeNew; 1283 switch (query.getLeafSelection()) { 1284 case 1: // random (numLeaves) 1285 treeNew = treeOrig.randomSample(query.getNumLeaves()); 1286 if (treeNew == null) return null; 1287 treeNew.computeStats(); 1288 break; 1289 case 2: // depth distributed (numLeaves, tempDepthThresh) 1290 treeNew = treeOrig.uniformSampleByTempDepth(query.getTempDepthThresh(), query.getNumLeaves()); 1291 if (treeNew == null) return null; 1292 treeNew.computeStats(); 1293 break; 1294 case 3: // depth weighted (numLeaves, tempDepthThresh) 1295 treeNew = treeOrig.weightedSampleByTempDepth(query.getTempDepthThresh(), query.getNumLeaves()); 1296 if (treeNew == null) return null; 1297 treeNew.computeStats(); 1298 break; 1299 case 4: // level distributed (numLeaves, levelThresh) 1300 treeNew = treeOrig.uniformSampleByLevel(query.getLevelThresh(), query.getNumLeaves()); 1301 if (treeNew == null) return null; 1302 treeNew.computeStats(); 1303 break; 1304 case 5: // level weighted (numLeaves, levelThresh) 1305 treeNew = treeOrig.weightedSampleByLevel(query.getLevelThresh(), query.getNumLeaves()); 1306 if (treeNew == null) return null; 1307 treeNew.computeStats(); 1308 break; 1309 case 6: // manual (leaves) 1310 treeNew = treeOrig.manualSampleByID(new ArrayList(query.getLeaves())); 1311 if (treeNew == null) return null; 1312 treeNew.computeStats(); 1313 break; 1314 default: 1315 // this is equivalent to "case 0". if tree not already 1316 // built, then build it now 1317 if (! treeOrig.isBuilt()) treeOrig.computeStats(); 1318 treeNew = treeOrig; 1319 } 1320 1321 // make sure the tree is valid 1322 if (treeNew.getNumSpecies() < 1) { 1323 CrimsonUtils.printError("Sampled tree has no leaves."); 1324 return null; 1325 } 1326 1327 String nchar; 1328 ArrayList partitions; // this is an array to preserve order 1329 1330 // used by seq selection methods 1-4, this will contain a list 1331 // of bp positions to be included in the output. these are 1332 // arrays so that they can differ per partition, in case 1333 // partitions very in size. 1334 ArrayList sampledPositions = new ArrayList(); 1335 1336 // this will contain the partition data printed at the end of 1337 // multi-partition PHYLIP files 1338 StringBuffer charPartition = new StringBuffer(); 1339 1340 ///////////// 1341 // get the partitions to be included and the values for nchar, 1342 // sampledPositions, and CharPartition 1343 if ((query.getSequenceSelection() == 5) || (query.isOnlyStruct())) { 1344 // only structure, so no partitions to be included 1345 partitions = new ArrayList(); 1346 nchar = "0"; 1347 } else { 1348 // figure out partition data to include 1349 partitions = new ArrayList(query.getPartitions()); 1350 if ((partitions == null) || (partitions.size() == 0)) { 1351 // no partitions specificed, so assume 'ALL'. 1352 partitions = Database.sqlToArray("SELECT id FROM PARTITIONS WHERE tree_id = '" + treeOrig.getID() + "'"); 1353 if (partitions == null) partitions = new ArrayList(); 1354 } 1355 1356 TreeSet expandedPositions = null; 1357 if ((query.getSequenceSelection() == 3) || (query.getSequenceSelection() == 4)) { 1358 // for manual selections, need to expand position 1359 // string to figure. this is independent of the 1360 // partition 1361 expandedPositions = expandPositions(query.getPositions(), (query.getSequenceSelection() == 3)); 1362 // if not able to expand then stop 1363 if (expandedPositions == null) return null; 1364 } 1365 1366 // used to count num positions across partitions 1367 int iLength = 0; 1368 1369 // set up the array such that each partition will have 1370 // it's own set of sampled positions 1371 sampledPositions = new ArrayList(); 1372 1373 // make sure all partitions are valid and compute relevant 1374 // sampling positions 1375 for (int iPartition = 0; iPartition < partitions.size(); iPartition++) { 1376 String partitionID = (String) partitions.get(iPartition); 1377 1378 // shouldn't need to test both database and partitionPool, but 1379 // it's better to be careful 1380 if ((! Partitions.dbContains(partitionID)) || (! ObjectHandles.containsPartition(partitionID))) { 1381 CrimsonUtils.printError("Partition doesn't exists: " + partitionID); 1382 return null; 1383 } 1384 1385 // get partition length 1386 int pLength = (ObjectHandles.getPartition(partitionID)).getLength(); 1387 if (pLength <= 0) { 1388 CrimsonUtils.printError("Partition (" + partitionID + ") has invalid length: " + pLength); 1389 return null; 1390 } 1391 1392 if ((query.getSequenceSelection() == 3) || (query.getSequenceSelection() == 4)) { 1393 // manual selection 1394 if (partitions.size() > 1) { 1395 // manual selection is only valid for a single 1396 // data partition. 1397 CrimsonUtils.printError("Can't manually select positions when using more than one data partition."); 1398 return null; 1399 } 1400 1401 // add valid positions to sampledPositions. Do 1402 // this for every partition since partitions will 1403 // have different lengths. Thus a valid position 1404 // for one partition may be invalid for 1405 // another. HOWEVER, only allow one partition to 1406 // be selected, so doing this here should not be 1407 // necessary. 1408 TreeSet sampPos = new TreeSet(); 1409 for (Iterator iExpPos = expandedPositions.iterator(); iExpPos.hasNext();) { 1410 Integer indx = (Integer) iExpPos.next(); 1411 if (indx.intValue() >= pLength) { 1412 CrimsonUtils.printWarning("Ignoring out of bounds position:" + indx); 1413 } else { 1414 sampPos.add(indx); 1415 } 1416 } 1417 1418 // with manual selection, nchar is number of positions 1419 iLength = sampPos.size(); 1420 1421 sampledPositions.add(sampPos); 1422 1423 } else { 1424 // compute random positions to be sampled. We do 1425 // this for each partition because the partition 1426 // lengths may be different. 1427 if (query.getSequenceSelection() == 1) { // random codon sampling 1428 sampledPositions.add(getRandomPositions(pLength, query.getNumPositions(), true)); 1429 } else if (query.getSequenceSelection() == 2) { // random bp sampling 1430 sampledPositions.add(getRandomPositions(pLength, query.getNumPositions(), false)); 1431 } 1432 1433 // this is the case for seq selection 1 and 2 1434 int numPositions = query.getNumPositions(); 1435 // seq selection 0 is select all 1436 if (query.getSequenceSelection() == 0) numPositions = pLength; 1437 1438 // count num of positions to include per partition 1439 if (pLength < numPositions) { 1440 // warn user that partition length is smaller 1441 // than requested number of positions 1442 String msg = "Partition '" + partitionID + "' does not contain '"; 1443 msg += numPositions + "' positions as requested."; 1444 CrimsonUtils.printWarning(msg); 1445 1446 // setup charPartition string for printing at 1447 // end of NEXUS file 1448 charPartition.append(" " + NexusFile.quote(partitionID) + ":"); 1449 charPartition.append((iLength + 1) + "-" + (iLength + pLength) + "\n"); 1450 charPartition.append((iPartition + 1 < partitions.size()) ? "," : ";"); 1451 1452 iLength += pLength; 1453 } else { 1454 // setup charPartition string for printing at 1455 // end of NEXUS file 1456 charPartition.append(" " + NexusFile.quote(partitionID) + ":"); 1457 charPartition.append((iLength + 1) + "-" + (iLength + numPositions) + "\n"); 1458 charPartition.append((iPartition + 1 < partitions.size()) ? "," : ";"); 1459 1460 iLength += numPositions; 1461 } 1462 } 1463 } 1464 1465 // convert integer nchar value to string which will be 1466 // used below 1467 nchar = String.valueOf(iLength); 1468 } 1469 1470 //////////////////////////////// 1471 // begin writting the PHYLIP file 1472 //////////////////////////////// 1473 try { 1474 FileWriter fWriter = new FileWriter(file); 1475 BufferedWriter bWriter = new BufferedWriter(fWriter); 1476 1477 ArrayList species = treeNew.getLeaves(); 1478 int ntax = species.size(); 1479 String tmpS = ""; 1480 int padL = 0; 1481 1482 // if no partitions then no data 1483 if (partitions.size() > 0) { 1484 // print character data header 1485 bWriter.write(ntax + " " + nchar + "\n"); 1486 1487 // process sequence data 1488 for (int iPartition = 0; iPartition < partitions.size(); iPartition++) { 1489 String partitionID = (String) partitions.get(iPartition); 1490 1491 // process species in chunks of size SPECIES_CHUNK 1492 for (int iSpecies = 0; iSpecies < ntax; iSpecies += SPECIES_CHUNK) { 1493 int endChunk = iSpecies + SPECIES_CHUNK; 1494 if (endChunk > ntax) endChunk = ntax; 1495 1496 // build query 1497 String sql = "SELECT species_id, sequence FROM PART_DATA WHERE partition_id = '" + partitionID + "'"; 1498 sql += " and (species_id = '" + ((Species) species.get(iSpecies)).getID() + "'"; 1499 for (int iChunk = iSpecies + 1; iChunk < endChunk; iChunk++) { 1500 sql += " or species_id = '" + ((Species) species.get(iChunk)).getID() + "'"; 1501 } 1502 sql += ")"; 1503 1504 ArrayList results = Database.sqlToArray(sql); 1505 1506 if (results == null) { 1507 CrimsonUtils.printError("PHYLIP file incomplete. Unable to get partition data: " + sql); 1508 bWriter.flush(); 1509 bWriter.close(); 1510 fWriter.close(); 1511 return null; 1512 } 1513 1514 for (Iterator rows = results.iterator(); rows.hasNext();) { 1515 ArrayList row = (ArrayList) rows.next(); 1516 String data = Database.readClob(row.get(1)); 1517 1518 if ((sampledPositions.size() > 0) && (sampledPositions.get(iPartition) != null)) { 1519 // only include positions in sampledPositions[] 1520 StringBuffer sampledData = new StringBuffer(); 1521 TreeSet sampPos = (TreeSet) sampledPositions.get(iPartition); 1522 for (Iterator i = sampPos.iterator(); i.hasNext();) { 1523 sampledData.append(data.charAt(((Integer) i.next()).intValue())); 1524 } 1525 data = sampledData.toString(); 1526 } 1527 1528 tmpS = String.valueOf(row.get(0)); 1529 if (tmpS.length() > 10) { 1530 // can't have species labels larger than 10 char 1531 CrimsonUtils.printError("Species label (" + row.get(0) + ") to large for PHYLIP format."); 1532 return null; 1533 } 1534 padL = 10 - tmpS.length(); 1535 for (int i = 0; i < padL; i++) tmpS += ' '; 1536 bWriter.write(tmpS + data + "\n"); 1537 } 1538 } 1539 1540 // put in a space between partitions 1541 bWriter.write("\n"); 1542 } 1543 1544 } 1545 1546 switch (incTree) { 1547 case 2: 1548 FileWriter fWriter_t = new FileWriter(file + ".tree"); 1549 BufferedWriter bWriter_t = new BufferedWriter(fWriter_t); 1550 // write newick to NEXUS file 1551 bWriter_t.write("#NEXUS;\n\n"); 1552 bWriter_t.write("BEGIN TREES;\n"); 1553 // make sure queryID is properly quoted 1554 bWriter_t.write(" TREE " + NexusFile.quote(queryID) + " = " + treeNew.toNewick(false) + ";\n"); 1555 bWriter_t.write("END;\n\n"); 1556 bWriter_t.flush(); 1557 bWriter_t.close(); 1558 fWriter_t.close(); 1559 break; 1560 case 3: 1561 case 4: 1562 fWriter_t = new FileWriter(file + ".newick"); 1563 bWriter_t = new BufferedWriter(fWriter_t); 1564 // output newick rooted 1565 if (incTree == 3) bWriter_t.write(treeNew.toNewick(false, true) + ";\n"); 1566 // output newick unrooted 1567 else bWriter_t.write(treeNew.toNewick(false, false) + ";\n"); 1568 bWriter_t.flush(); 1569 bWriter_t.close(); 1570 fWriter_t.close(); 1571 break; 1572 } // done printing TREES block 1573 1574 bWriter.flush(); 1575 bWriter.close(); 1576 fWriter.close(); 1577 } catch (SQLException e) { 1578 CrimsonUtils.printError("SQL error processing CLOB."); 1579 CrimsonUtils.printError(e.getMessage()); 1580 return null; 1581 } catch (FileNotFoundException e) { // problem with FileOutputStream 1582 CrimsonUtils.printError("File \"" + filename + "\" can not be opened."); 1583 CrimsonUtils.printError(e.getMessage()); 1584 return null; 1585 } catch (IOException e) { // problem with ObjectOutputStream. 1586 // XXX do we need to close bWriter()? 1587 CrimsonUtils.printError("Error writting output file \"" + filename + "\"."); 1588 CrimsonUtils.printError(e.getMessage()); 1589 return null; 1590 } 1591 1592 CrimsonUtils.printMsg("Query finished."); 1593 1594 return treeNew; 1595 } 1596 1597 private static TreeSet getRandomPositions(int partitionLength, int numSamples, boolean isCodons) { 1598 // sampledPositions goes from 0 to ParititionLength - 1 1599 TreeSet sampledPositions = new TreeSet(); 1600 1601 // used to store the positions selected so that we can print 1602 // them out for debugging 1603 TreeSet posSelected = new TreeSet(); 1604 1605 // set up sampledPositions for sequence selection 1606 // methods 1 and 2 1607 if (isCodons) { // random codon sampling 1608 // if more samples than in the partition, then return null 1609 // which will cause all positions to be included 1610 if ((numSamples * 3) >= partitionLength) return null; 1611 1612 int numCodons = partitionLength / 3; 1613 1614 // flag if last codon isn't a complete codon 1615 int remainder = partitionLength % 3; 1616 if (remainder > 0) numCodons++; 1617 1618 // we need to select random positions, 1619 // non-repetitively. to make this simple, we use an array 1620 // and just remove items as they are selected. 1621 ArrayList tmpList = new ArrayList(numCodons); 1622 for (int i = 0; i < numCodons; i++) { tmpList.add(new Integer(i)); } 1623 1624 int iEnd = (numSamples * 3); 1625 while (sampledPositions.size() < iEnd) { 1626 int iRand = CrimsonUtils.random.nextInt(tmpList.size()); 1627 int value = ((Integer) tmpList.get(iRand)).intValue(); 1628 1629 if (DEBUG_OUTPUT) posSelected.add(new Integer(value + 1)); 1630 1631 // test is selected last (partial codon) 1632 if ((remainder > 0) && (value == numCodons - 1)) { 1633 sampledPositions.add(new Integer(value * 3)); 1634 1635 if (remainder == 2) { 1636 sampledPositions.add(new Integer(value * 3 + 1)); 1637 iEnd -= 1; // 1 fewer position to sample 1638 } else { 1639 iEnd -= 2; // 2 fewer positions to sample 1640 } 1641 1642 remainder = 0; // no longer a remainder 1643 } else { 1644 sampledPositions.add(new Integer(value * 3)); 1645 sampledPositions.add(new Integer(value * 3 + 1)); 1646 sampledPositions.add(new Integer(value * 3 + 2)); 1647 } 1648 1649 tmpList.remove(iRand); 1650 } 1651 1652 } else { // random sampling 1653 // if more samples than in the partition, then return null 1654 // which will cause all positions to be included 1655 if (numSamples >= partitionLength) return null; 1656 1657 // we need to select random positions, 1658 // non-repetitively. to make this simple, we use an array 1659 // and just remove items as they are selected. 1660 ArrayList tmpList = new ArrayList(partitionLength); 1661 for (int i = 0; i < partitionLength; i++) { tmpList.add(new Integer(i)); } 1662 1663 for (int i = 0; i < numSamples; i++) { 1664 int iRand = CrimsonUtils.random.nextInt(tmpList.size()); 1665 int value = ((Integer) tmpList.get(iRand)).intValue(); 1666 1667 if (DEBUG_OUTPUT) posSelected.add(new Integer(value + 1)); 1668 1669 // sampled positions go from 0 to N-1 1670 sampledPositions.add(new Integer(value)); 1671 1672 tmpList.remove(iRand); 1673 } 1674 } 1675 1676 // print the positions to the output file 'positions.list' for 1677 // debugging 1678 if (DEBUG_OUTPUT) FileIO.printPositions(posSelected); 1679 1680 return sampledPositions; 1681 } 1682 1683 /** 1684 * This will convert a set of bp ranges to bp positions. If the 1685 * set is codons, the underlying bp positions for the codons we be 1686 * used. Note that the user enters positions going from (1 to N) 1687 * but when we access the data we go from (0 to N-1). So we shift 1688 * the user specified positions down by 1. 1689 */ 1690 public static TreeSet expandPositions(HashSet positionSet, boolean isCodons) { 1691 if ((positionSet == null) || (positionSet.size() == 0)) { 1692 CrimsonUtils.printError("Empty position list."); 1693 return null; 1694 } 1695 1696 TreeSet sampledPositions = new TreeSet(); 1697 1698 // used for debugging 1699 TreeSet posSelected = new TreeSet(); 1700 1701 for (Iterator i = positionSet.iterator(); i.hasNext();) { 1702 String position = (String) i.next(); 1703 1704 if (position.contains("-")) { 1705 String[] range = position.split("-"); 1706 1707 if (range.length != 2) { // make sure range is valid 1708 CrimsonUtils.printError("Incorrect range in position list: " + position); 1709 return null; 1710 } 1711 1712 int iStart; 1713 int iEnd; 1714 if (isCodons) { 1715 // the range goes from start to end, inclusively 1716 iStart = (Integer.parseInt(range[0]) - 1) * 3; 1717 iEnd = (Integer.parseInt(range[1]) * 3); 1718 1719 // this is added here to simplify things because 1720 // we want the codon start positions, not the 1721 // positions of each bp within the codons - which 1722 // is what is added to the 'postitions' array 1723 if (DEBUG_OUTPUT) { 1724 int tmpI = Integer.parseInt(range[1]); 1725 for (int cnt = Integer.parseInt(range[0]); cnt < tmpI; cnt++) { 1726 posSelected.add(new Integer(cnt)); 1727 } 1728 } 1729 } else { 1730 // the range goes from start to end, inclusively 1731 iStart = Integer.parseInt(range[0]) - 1; 1732 iEnd = Integer.parseInt(range[1]); 1733 1734 if (DEBUG_OUTPUT) { 1735 int tmpI = Integer.parseInt(range[1]); 1736 for (int cnt = Integer.parseInt(range[0]); cnt < tmpI; cnt++) { 1737 posSelected.add(new Integer(cnt)); 1738 } 1739 } 1740 } 1741 1742 if (iStart < 0) { // make sure range is valid 1743 CrimsonUtils.printError("Invalid position list. Positions start at 1."); 1744 return null; 1745 } 1746 1747 for (int cnt = iStart; cnt < iEnd; cnt++) { 1748 sampledPositions.add(new Integer(cnt)); 1749 } 1750 } else { 1751 if (DEBUG_OUTPUT) posSelected.add(new Integer(position)); 1752 1753 if (isCodons) { 1754 int iEnd = (Integer.parseInt(position) * 3) - 1; 1755 for (int cnt = iEnd - 2; cnt <= iEnd; cnt++) { 1756 sampledPositions.add(new Integer(cnt)); 1757 } 1758 } else { 1759 sampledPositions.add(new Integer(Integer.parseInt(position) - 1)); 1760 } 1761 } 1762 } 1763 1764 // print the positions to the output file 'positions.list' for 1765 // debugging 1766 if (DEBUG_OUTPUT) FileIO.printPositions(posSelected); 1767 1768 return sampledPositions; 1769 } 1770 1771 /** 1772 * Returns the non-Clob columns in the QUERIES table. Each table 1773 * record will be separated by a '\n' in the output. 1774 */ 1775 public String toString() { 1776 if (! Database.isOpen()) { 1777 CrimsonUtils.printError("Can list queries, no open database."); 1778 return ""; 1779 } 1780 1781 // SELECT id, leaf_select, num_leaves, temp_depth_thresh, level_thresh, seq_select, num_pos, seed, notes FROM queries; 1782 String sql = "SELECT ID, LEAF_SELECT, NUM_LEAVES, TEMP_DEPTH_THRESH, LEVEL_THRESH, SEQ_SELECT, NUM_POS, SEED, NOTES FROM QUERIES"; 1783 ArrayList results = Database.sqlToArray(sql); 1784 1785 if (results == null) { 1786 CrimsonUtils.printError("Error printing QUERIES table."); 1787 return ""; 1788 } 1789 1790 StringBuffer out = new StringBuffer(""); 1791 out.append("ID \t LEAF_SELECT \t NUM_LEAVES \t TEMP_DEPTH_THRESH \t LEVEL_THRESH \t SEQ_SELECT \t NUM_POS \t SEED \t NOTES\n"); 1792 for (Iterator rows = results.iterator(); rows.hasNext();) { 1793 ArrayList row = (ArrayList) rows.next(); 1794 for (int i = 0; i < 7; i++) out.append(String.valueOf(row.get(i)) + " \t "); 1795 out.append(String.valueOf(row.get(7)) + "\n"); 1796 } 1797 1798 return out.toString().trim(); 1799 } 1800 1801 } // Queries.java