001 /* 002 * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of 003 * Pennsylvania. 004 * 005 * This file is part of Glo-DB. 006 * 007 * Glo-DB is free software: you can redistribute it and/or modify it 008 * under the terms of the GNU General Public License as published by 009 * the Free Software Foundation, either version 3 of the License, or 010 * (at your option) any later version. 011 * 012 * Glo-DB is distributed in the hope that it will be useful, but 013 * WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * General Public License for more details. 016 * 017 * You should have received a copy of the GNU General Public License 018 * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>. 019 * 020 * @(#)FASTATrack.java 021 */ 022 023 package edu.upenn.gloDB.io; 024 025 import edu.upenn.gloDB.*; 026 import edu.upenn.gloDB.gui.GUIUtils; 027 import java.io.*; 028 import java.util.HashMap; 029 import java.util.Set; 030 import java.util.Iterator; 031 import java.text.NumberFormat; 032 import javax.swing.filechooser.FileFilter; 033 034 /** 035 * Import Track data from a FASTA file. The basic file format 036 * dictates a header line at the beginning of each Feature. There 037 * are no standards as to what the header line should contain or how 038 * it should be formatted, other than to stipulate that it begins with 039 * a ">". Thus this format sufficient for coding Sequence objects but 040 * not ideal for sequence annotations (Tracks). Since some sites, 041 * such as www.fruitfly.org, release annotations as FASTA files, some 042 * attempt has been made to parse the headers from specific sites. 043 * Users can use the FASTAParser interface to create their own header 044 * parsers as well. Here the default is FASTAParserFly. 045 * 046 * @XXX Can we assume that the header starts with a Sequence ID? 047 * 048 * @XXX SaveTrack() looks for 'ID', 'descriptors', 'dbxref', 049 * 'strand', 'source', and 'boundaries' in the Feature attributes and 050 * processes these uniquely. In particular, 'boundaries' is discarded 051 * because it's assumed to be the same as Feature.start and 052 * Feature.stop. If 'source' is also discarded if it's the same as 053 * Feature.getSource().getID(). 'strand' is used in creating 054 * 'gene_boundaries' and similarly discarded. the 'descriptors' and 055 * 'dbxref' labels are not included in the output, but their HashMap 056 * values are included. 057 * 058 * @author Stephen Fisher 059 * @version $Id: FASTATrack.java,v 1.1.2.21 2007/03/01 21:17:33 fisher Exp $ 060 */ 061 062 public class FASTATrack implements TrackFile { 063 064 private final int ID = FileIO.FASTA; 065 private final String DESC = "FASTA files (*.fa; *.fas; *.fasta)"; 066 private final String[] EXT = {".fa", ".fas", ".fasta"}; 067 private final FileFilter fileFilter = new FASTAFilter(); 068 069 //-------------------------------------------------------------------------- 070 // Setters and Getters 071 072 public int getID() { return ID; } 073 074 public String getDesc() { return DESC; } 075 076 public String[] getExt() { return EXT; } 077 078 public FileFilter getFileFilter() { return fileFilter; } 079 080 //-------------------------------------------------------------------------- 081 // Miscellaneous Methods 082 083 /** 084 * Load all Features in the FASTA file into a single Track and 085 * return the resulting Track object. If possible, a Sequence 086 * object will be loaded/created for each Feature from the FASTA 087 * file. 088 */ 089 public Track load(String filename) { 090 return load(filename, ""); 091 } 092 093 /** 094 * Load all Features in the FASTA file into a single Track and 095 * return the resulting Track object. If a Sequence is given, 096 * then that will be used as the source file all Features in the 097 * file, otherwise a Sequence object will be loaded/created for 098 * each Feature from the FASTA file. 099 * 100 * The header is parsed using {@link FASTAParserFly 101 * FASTAParserFly}. An {@link ExactFeature ExactFeature} object 102 * is created with the start and stop positions taken from the 103 * "boundaries" key:value pair. The parsed header is stored in 104 * the {@link AbstractFeature#attributes 105 * AbstractFeature.attributes} field of the {@link ExactFeature 106 * ExactFeature} object. 107 * 108 * If the file is empty then returns 'null'. 109 * 110 * If this can't get a valid Sequence ID from the user or the 111 * Feature's header, then can't be associated with any existing 112 * Sequence and so this will create a Sequence with it's best 113 * guess at the Sequence ID. However, this isn't very useful 114 * because it's not likely that other Features will share this 115 * Sequence. There's also no capacity to load this Sequence data 116 * later, so the Sequence data is load here as well, which is very 117 * inefficient. 118 * 119 * @XXX When skipping a Feature because the Sequence data loaded 120 * doesn't contain the correct range, should we discard the loaded 121 * Sequence or leave it in the sequencePool? 122 * @XXX I'm not sure how the position information is formatted. 123 * @XXX Need to throw FileIO exceptions, rather than just print 124 * errors. 125 */ 126 public Track load(String filename, String sourceID) { 127 // when creating Track's ID, remove ".fasta" or ".fas" or 128 // ".fa" filename extension, if present. Use File() to remove 129 // any path info from the filename so that the ID is just the 130 // name. 131 File file = new File(filename); 132 String id = file.getName(); 133 if (id.endsWith(".fasta")) id = id.substring(0, id.length()-6); 134 else if (id.endsWith(".fas")) id = id.substring(0, id.length()-4); 135 else if (id.endsWith(".fa")) id = id.substring(0, id.length()-3); 136 137 Track track = new Track(false, id); 138 Sequence source = null; 139 if (! GloDBUtils.isEmpty(sourceID)) { 140 source = ObjectHandles.getSequence(sourceID); 141 if (source == null) { 142 String msg = "The source ID \"" + sourceID + "\" isn't valid."; 143 msg += "Source IDs will be set from the feature headers."; 144 GloDBUtils.printMsg(msg, GloDBUtils.WARNING); 145 } 146 } 147 148 try { 149 BufferedReader bReader = new BufferedReader(new FileReader(file)); 150 String line; 151 152 boolean validFeature = false; 153 boolean loadSequenceData = false; 154 155 Feature cFeature = null; // current Feature 156 Sequence seq = null; 157 String data = ""; // store sequence data as read from file 158 159 while ((line = bReader.readLine()) != null) { 160 // skip empty lines 161 if (GloDBUtils.isEmpty(line)) continue; 162 163 // test for Sequence header 164 if (line.startsWith(">")) { 165 if (validFeature) { 166 if (loadSequenceData) { 167 // we need to save the Sequence/contig data 168 seq.setData(data); 169 170 // reset the Sequence flag because starting to 171 // read in a new Feature 172 loadSequenceData = false; 173 174 // make sure that the Feature is valid; that is, 175 // if the sequence loaded isn't large enough to 176 // encompass the Feature we won't include the 177 // Feature 178 if (seq.contains(cFeature)) { 179 // valid Feature, so add to Track 180 track.addFeature(cFeature); 181 } else { 182 String msg = "Skipping record because the sequence loaded (" + seq.getID() + ") doesn't encompass the entire feature: \n"; 183 msg += cFeature.getAttributes(); 184 GloDBUtils.printError(msg); 185 } 186 } else { 187 // not first Feature so add existing 188 // Feature before reseting the variables 189 // for the next Feature. We can assume 190 // this is a valid Feature. 191 track.addFeature(cFeature); 192 } 193 } 194 195 // we assume this will be a valid Feature 196 validFeature = true; 197 198 // assume all features are "exact", thus just have start and 199 // stop positions. 200 int start = 0; 201 int stop = 0; 202 HashMap attributes; 203 204 // reset Sequence info 205 seq = source; 206 data = ""; 207 208 // process source/boundaries (was call 209 // "gene_boundaries" in the FASTA file) here, 210 // creating a Feature. 211 212 // XXX: what is the format for the position 213 // information? can a track have more than one 214 // position pair? for example one pair looks like 215 // this: (X:1,488..3,280[-]) and can there be 216 // cases with more than one pair such as something 217 // like this: (X:1,488..3,280;4,453..6,654[-]) 218 FASTAParser parser = new FASTAParserFly(); 219 attributes = parser.parseHeader(line); 220 221 if (! attributes.containsKey("boundaries")) { 222 String msg = "Skipping record because no feature information in header: \n"; 223 msg += line; 224 GloDBUtils.printError(msg); 225 validFeature = false; // not valid Feature 226 continue; 227 } 228 String boundaries = (String) attributes.get("boundaries"); 229 230 // get the Sequence ID and start/stop positions 231 // String pos[] = boundaries.split(":", 2); 232 // sourceID = pos[0]; 233 // XXX: This assumes that these Features do NOT 234 // have more than one position pair. 235 // pos = pos[1].split("\\.\\."); 236 String pos[] = boundaries.split("\\.\\."); 237 238 // add position information to the Feature object. the 239 // positions have commas and thus need to be 'parsed' and not 240 // just converted from String to Integer. 241 NumberFormat nf = NumberFormat.getNumberInstance(); 242 try { 243 start = (nf.parse(pos[0])).intValue(); 244 stop = (nf.parse(pos[1])).intValue(); 245 } catch (Exception e) { 246 String msg = "Skipping record because unable to parse feature position information: \n"; 247 msg += line; 248 GloDBUtils.printError(msg); 249 validFeature = false; // not valid Feature 250 continue; 251 } 252 253 if (seq == null) { 254 // we don't have user-specified source info, 255 // so check the "gene_boundaries" Sequence ID 256 // to see if it's valid 257 if (attributes.containsKey("source")) { 258 sourceID = (String) attributes.get("source"); 259 seq = ObjectHandles.getSequence(sourceID); 260 } 261 262 if ((seq == null) || 263 (! seq.contains(start)) || (! seq.contains(stop))) { 264 // still haven't found a valid Sequence 265 // for this Feature, so check the ID at 266 // the beginning of the header line. The 267 // ID was parsed by FASTAParserFly and 268 // added as an attribute. 269 if (attributes.containsKey("ID")) { 270 sourceID = (String) attributes.get("ID"); 271 272 // we need to test if new source ID is valid 273 seq = ObjectHandles.getSequence(sourceID); 274 } 275 276 if ((seq == null) || 277 (! seq.contains(start)) || (! seq.contains(stop))) { 278 // XXX If still no source info, then need 279 // to load the source info from the file, 280 // creating a new Sequence object. If 281 // sourceID is empty, then a random ID 282 // will be created in Sequence(). 283 284 // If we've gotten this far then the 285 // Feature can't be associated with 286 // any existing Sequence and so we're 287 // creating a Sequence here. However, 288 // this isn't very useful because it's 289 // not likely that other Features will 290 // share this Sequence. There's also 291 // no capacity to load this Sequence 292 // data later, so we are going to have 293 // to load it now as well, which is 294 // very inefficient. 295 try { 296 GloDBUtils.printMsg("Source \"" + sourceID + "\" doesn't exist, loading sequence data."); 297 seq = new Sequence(sourceID); 298 seq.setAttributes(attributes); 299 } catch (InvalidIDException e) { 300 // This shoud never be reached but 301 // is here just in case something 302 // goes wrong above. 303 String newID = Sequence.randomID("_S"); 304 String msg = "Source \"" + sourceID + "\" already exists, using ID \"" 305 + newID + "\" instead."; 306 GloDBUtils.printMsg(msg, GloDBUtils.WARNING); 307 seq = new Sequence(newID); 308 seq.setAttributes(attributes); 309 } 310 311 // use the Feature's start position 312 // as the offset position for the 313 // Sequence data 314 315 // XXX we should shift the feature 316 // start/stop postions to go from 0 to 317 // (length-offset). 318 seq.setOffset(start); 319 320 // at this point we need to load the 321 // Sequence data from the input file. 322 loadSequenceData = true; 323 } 324 } 325 } 326 327 // starting a new Feature. 'Seq' is either set 328 // to 'sourceID', as provided when this method was 329 // called, or to the Sequence contig loaded from 330 // this FASTA file. 331 cFeature = new ExactFeature(start, stop, seq); 332 // add attributes to Feature object 333 cFeature.setAttributes(attributes); 334 } else { 335 // if necessary, load Sequence/contig data 336 if (loadSequenceData) data += line; 337 } 338 } 339 340 // add last Feature's info 341 if (validFeature && (cFeature != null)) { 342 if (loadSequenceData) { 343 // we need to save the Sequence/contig data 344 seq.setData(data); 345 346 // make sure that the Feature is valid; that is, 347 // if the sequence loaded isn't large enough to 348 // encompass the Feature we won't include the 349 // Feature 350 if (seq.contains(cFeature)) { 351 // valid Feature, so add to Track 352 track.addFeature(cFeature); 353 } else { 354 String msg = "Skipping record because the sequence loaded (" + seq.getID() + ") doesn't encompass the entire feature: \n"; 355 msg += cFeature.getAttributes(); 356 GloDBUtils.printError(msg); 357 } 358 } else { 359 // valid Feature, so add to Track 360 track.addFeature(cFeature); 361 } 362 } 363 364 bReader.close(); 365 } catch (FileNotFoundException e) { 366 GloDBUtils.printError("File not found: " + e.getMessage()); 367 return null; 368 } catch (IOException e) { 369 GloDBUtils.printError("Error reading file: " + filename); 370 return null; 371 } 372 373 if (track.numFeatures() == 0) { 374 // this assumes an empty Track is a mistake, so return null 375 GloDBUtils.printError("Unable to load any features from the file: " + filename); 376 return null; 377 } 378 379 // add track to trackPool 380 try { 381 ObjectHandles.addTrack(track); 382 } catch (InvalidIDException e) { 383 String id_new = Track.randomID("_T"); 384 String msg = "ID \"" + track.getID() + "\" already exists, using ID \"" + id_new + "\" instead."; 385 GloDBUtils.printWarning(msg); 386 387 // add self to set of all Tracks, using new ID 388 track.setID(id_new, false); 389 ObjectHandles.addTrack(track); 390 } 391 392 GloDBUtils.printMsg("Loaded FASTA file: " + filename); 393 return track; 394 } 395 396 /** 397 * Save the Track to a file based on it's ID. This will overwrite 398 * any existing file. This will append ".fasta" to the filename. 399 */ 400 public void save(String id) { 401 // add ".fasta" filename extension, if necessary 402 String filename = id; 403 if ((! filename.endsWith(".fa")) && (! filename.endsWith(".fas")) 404 && (! filename.endsWith(".fasta"))) { 405 filename += ".fasta"; 406 } 407 408 save(id, filename, true); 409 } 410 411 /** 412 * Save all Features in a FASTA file. 413 * 414 * @XXX need to throw FileIO exceptions, rather than just print 415 * errors. 416 * @XXX How should the attributes be formatted? Should we remove 417 * 'ID', 'descriptors', 'dbxref', 'strand', 'source', and 418 * 'boundaries' from the header since these were most likely added 419 * when we created the header? 420 */ 421 public void save(String id, String filename, boolean overwrite) { 422 // add ".fasta" filename extension, if necessary 423 if ((! filename.endsWith(".fa")) && (! filename.endsWith(".fas")) 424 && (! filename.endsWith(".fasta"))) { 425 filename += ".fasta"; 426 } 427 428 File file = new File(filename); 429 // if the file already exists and not supposed to overwrite 430 // it, then return on error. 431 if (file.exists() && (! overwrite)) { 432 GloDBUtils.printError("File \"" + filename + "\" already exists."); 433 return; 434 } 435 436 try { 437 Track track = ObjectHandles.getTrack(id); 438 if (track == null) { 439 GloDBUtils.printError("Track \"" + id + "\" doesn't exist."); 440 return; 441 } 442 443 FileWriter fWriter = new FileWriter(file); 444 BufferedWriter bWriter = new BufferedWriter(fWriter); 445 446 for (Iterator s = track.getSourceSet().iterator(); s.hasNext();) { 447 String sequenceID = (String) s.next(); 448 Sequence sequence = (Sequence) ObjectHandles.sequencePool.get(sequenceID); 449 450 // get sequence data for this source 451 String seqData = sequence.getData(); 452 int offset = sequence.getOffset(); 453 454 for (Iterator i = track.featuresBySource(sequenceID).iterator(); i.hasNext();) { 455 Feature feature = (Feature) i.next(); 456 457 // XXX should include more formatting 458 String header = ">"; 459 460 // create a copy of the attributes so we can remove 461 // objects from the HashMap as we process them below 462 HashMap attribs = feature.getAttributesMap(); 463 464 // start with the ID attribute, if not present, then 465 // use the Track's ID. 466 if (attribs.containsKey("ID")) { 467 header += attribs.get("ID"); 468 attribs.remove("ID"); 469 } else { 470 header += id; 471 } 472 473 // if contains 'descriptors' then remove label 474 if (attribs.containsKey("descriptors")) { 475 header += " " + attribs.get("descriptors"); 476 attribs.remove("descriptors"); 477 } 478 479 // if "gene_boundaries" already exists, then we 480 // probably didn't process this header and so we 481 // should just leave it alone 482 if (! attribs.containsKey("gene_boundaries")) { 483 String gb = "gene_boundaries:(" + feature.getSource().getID() + ":"; 484 gb += feature.getStart() + ".." + feature.getStop(); 485 486 // if 'source' already handled, then remove from 487 // attribs map 488 if (attribs.containsKey("source")) { 489 String value = (String) attribs.get("source"); 490 // if (value.equalsIgnoreCase(feature.getSource().getID())) { 491 if (value.equals(feature.getSource().getID())) { 492 attribs.remove("source"); 493 } 494 } 495 496 // if 'boundaries' exists then remove from attribs 497 // map, because this should be equivalent to the 498 // Feature's start/stop 499 attribs.remove("boundaries"); 500 501 if (attribs.containsKey("strand")) { 502 gb += "[" + (String) attribs.get("strand") + "]"; 503 attribs.remove("strand"); // don't need anymore 504 } 505 gb += ")"; 506 header += " " + gb; 507 } 508 509 // if contains 'dbxref' then remove label and enclose 510 // in '()' 511 if (attribs.containsKey("dbxref")) { 512 header += " ("; 513 Set dbxref = (Set) attribs.get("dbxref"); 514 for (Iterator dI = dbxref.iterator(); dI.hasNext();) { 515 header += dI.next(); 516 } 517 header += ")"; 518 attribs.remove("dbxref"); 519 } 520 521 // add remaining attributes to the header 522 for (Iterator l = (attribs.keySet()).iterator(); l.hasNext();) { 523 String key = (String) l.next(); 524 header += " " + key + ":" + attribs.get(key); 525 } 526 527 bWriter.write(header); 528 bWriter.newLine(); 529 530 // if offset = 0, then not set so need to adjust for 531 // sequence starting at 1 and String starting at 0. If 532 // offset is set, then 533 int start = feature.getStart(); 534 int stop = feature.getStop(); 535 if (offset == 0) { 536 start -= 1; 537 } else { 538 start = start - offset; 539 stop = (stop - offset) + 1; 540 } 541 542 // output to file with Sequence.FORMAT_WIDTH 543 // characters per line 544 String boundedData = seqData.substring(start, stop); 545 int dataLen = boundedData.length(); 546 int idx = Sequence.FORMAT_WIDTH; 547 while (idx < dataLen) { 548 bWriter.write(boundedData.substring(idx - Sequence.FORMAT_WIDTH, idx) + "\n"); 549 idx += Sequence.FORMAT_WIDTH; 550 } 551 if (idx >= dataLen) bWriter.write(boundedData.substring(idx - Sequence.FORMAT_WIDTH, 552 dataLen)); 553 bWriter.newLine(); 554 } 555 } 556 557 bWriter.newLine(); 558 bWriter.flush(); 559 bWriter.close(); 560 } catch (FileNotFoundException e) { 561 // problem with FileOutputStream 562 GloDBUtils.printError("File \"" + filename + "\" can not be opened."); 563 } catch (IOException e) { 564 // problem with ObjectOutputStream. XXX do we need to 565 // close 'oStream'? 566 GloDBUtils.printError("Error writting output file \"" + filename + "\"."); 567 } 568 } 569 570 /** 571 * FASTA specific FileFilter. 572 * @XXX This should use EXT. 573 */ 574 private class FASTAFilter extends FileFilter { 575 public boolean accept(File f) { 576 // accept directories 577 if (f.isDirectory()) return true; 578 579 // if true, then don't filter by file extensions. 580 if (GUIUtils.showAllFiles()) return true; 581 582 // accept files ending in '.fasta' or '.fas' or '.fa' 583 if ((f.getName()).endsWith(".fasta")) return true; 584 if ((f.getName()).endsWith(".fas")) return true; 585 if ((f.getName()).endsWith(".fa")) return true; 586 587 return false; 588 } 589 590 // set the filter's description 591 public String getDescription() { return DESC; } 592 } 593 594 } // FASTATrack.java 595 596