001    /*
002     * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of
003     * Pennsylvania.
004     *
005     * This file is part of Glo-DB.
006     * 
007     * Glo-DB is free software: you can redistribute it and/or modify it
008     * under the terms of the GNU General Public License as published by
009     * the Free Software Foundation, either version 3 of the License, or
010     * (at your option) any later version.
011     * 
012     * Glo-DB is distributed in the hope that it will be useful, but
013     * WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * General Public License for more details.
016     * 
017     * You should have received a copy of the GNU General Public License
018     * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>.
019     *
020     * @(#)FASTATrack.java
021     */
022    
023    package edu.upenn.gloDB.io;
024    
025    import edu.upenn.gloDB.*;
026    import edu.upenn.gloDB.gui.GUIUtils;
027    import java.io.*;
028    import java.util.HashMap;
029    import java.util.Set;
030    import java.util.Iterator;
031    import java.text.NumberFormat;
032    import javax.swing.filechooser.FileFilter;
033    
034    /**
035     * Import Track data from a FASTA file.  The basic file format
036     * dictates a header line at the beginning of each Feature.  There
037     * are no standards as to what the header line should contain or how
038     * it should be formatted, other than to stipulate that it begins with
039     * a ">".  Thus this format is sufficient for coding Sequence objects but
040     * not ideal for sequence annotations (Tracks).  Since some sites,
041     * such as www.fruitfly.org, release annotations as FASTA files, some
042     * attempt has been made to parse the headers from specific sites.
043     * Users can use the FASTAParser interface to create their own header
044     * parsers as well.  Here the default is FASTAParserFly.
045     *
046     * @XXX Can we assume that the header starts with a Sequence ID?
047     *
048     * @XXX SaveTrack() looks for 'ID', 'descriptors', 'dbxref',
049     * 'strand', 'source', and 'boundaries' in the Feature attributes and
050     * processes these uniquely.  In particular, 'boundaries' is discarded
051     * because it's assumed to be the same as Feature.start and
052     * Feature.stop.  'source' is also discarded if it's the same as
053     * Feature.getSource().getID().  'strand' is used in creating
054     * 'gene_boundaries' and similarly discarded.  The 'descriptors' and
055     * 'dbxref' labels are not included in the output, but their HashMap
056     * values are included.
057     *
058     * @author  Stephen Fisher
059     * @version $Id: FASTATrack.java,v 1.1.2.21 2007/03/01 21:17:33 fisher Exp $
060     */
061    
062    public class FASTATrack implements TrackFile {
063    
064             private final int ID = FileIO.FASTA;
065             private final String DESC = "FASTA files (*.fa; *.fas; *.fasta)";
066             private final String[] EXT = {".fa", ".fas", ".fasta"};
067             private final FileFilter fileFilter = new FASTAFilter();
068    
069        //--------------------------------------------------------------------------
070        // Setters and Getters
071    
072             public int getID() { return ID; }
073    
074             public String getDesc() { return DESC; }
075    
076             public String[] getExt() { return EXT; }
077    
078             public FileFilter getFileFilter() { return fileFilter; }
079    
080        //--------------------------------------------------------------------------
081        // Miscellaneous Methods
082    
083             /**
084              * Load all Features in the FASTA file into a single Track and
085              * return the resulting Track object.  If possible, a Sequence
086              * object will be loaded/created for each Feature from the FASTA
087              * file.
088              */
089             public Track load(String filename) {
090                      return load(filename, "");
091             }
092    
093             /**
094              * Load all Features in the FASTA file into a single Track and
095              * return the resulting Track object.  If a Sequence is given,
096              * then that will be used as the source for all Features in the
097              * file, otherwise a Sequence object will be loaded/created for
098              * each Feature from the FASTA file.  
099              *
100              * The header is parsed using {@link FASTAParserFly
101              * FASTAParserFly}.  An {@link ExactFeature ExactFeature} object
102              * is created with the start and stop positions taken from the
103              * "boundaries" key:value pair.  The parsed header is stored in
104              * the {@link AbstractFeature#attributes
105              * AbstractFeature.attributes} field of the {@link ExactFeature
106              * ExactFeature} object.
107              *
108              * If the file is empty then returns 'null'.
109              *
110              * If this can't get a valid Sequence ID from the user or the
111              * Feature's header, then the Feature can't be associated with any
112              * existing Sequence and so this will create a Sequence with its best
113              * guess at the Sequence ID.  However, this isn't very useful
114              * because it's not likely that other Features will share this
115              * Sequence.  There's also no capacity to load this Sequence data
116              * later, so the Sequence data is loaded here as well, which is very
117              * inefficient.
118              *
119              * @XXX When skipping a Feature because the Sequence data loaded
120              * doesn't contain the correct range, should we discard the loaded
121              * Sequence or leave it in the sequencePool?
122              * @XXX I'm not sure how the position information is formatted.
123              * @XXX Need to throw FileIO exceptions, rather than just print
124              * errors.
125              */
126             public Track load(String filename, String sourceID) {
127                      // when creating Track's ID, remove ".fasta" or ".fas" or
128                      // ".fa" filename extension, if present.  Use File() to remove
129                      // any path info from the filename so that the ID is just the
130                      // name.
131                      File file = new File(filename);
132                      String id = file.getName();
133                      if (id.endsWith(".fasta")) id = id.substring(0, id.length()-6);
134                      else if (id.endsWith(".fas")) id = id.substring(0, id.length()-4);
135                      else if (id.endsWith(".fa")) id = id.substring(0, id.length()-3);
136    
137                      Track track = new Track(false, id);
138                      Sequence source = null;
139                      if (! GloDBUtils.isEmpty(sourceID)) {
140                                    source = ObjectHandles.getSequence(sourceID);
141                                    if (source == null) {
142                                             String msg = "The source ID \"" + sourceID + "\" isn't valid.";
143                                             msg += "Source IDs will be set from the feature headers.";
144                                             GloDBUtils.printMsg(msg, GloDBUtils.WARNING);
145                                    }
146                      }
147    
148                      try {
149                                    BufferedReader bReader = new BufferedReader(new FileReader(file));
150                                    String line;
151    
152                                    boolean validFeature = false;
153                                    boolean loadSequenceData = false;
154    
155                                    Feature cFeature = null;  // current Feature
156                                    Sequence seq = null;
157                                    String data = "";  // store sequence data as read from file
158    
159                                    while ((line = bReader.readLine()) != null) {
160                                             // skip empty lines
161                                             if (GloDBUtils.isEmpty(line)) continue;
162    
163                                             // test for Sequence header
164                                             if (line.startsWith(">")) {
165                                                      if (validFeature) {
166                                                                    if (loadSequenceData) { 
167                                                                             // we need to save the Sequence/contig data
168                                                                             seq.setData(data);
169    
170                                                                             // reset the Sequence flag because starting to
171                                                                             // read in a new Feature
172                                                                             loadSequenceData = false;
173                                                      
174                                                                             // make sure that the Feature is valid; that is,
175                                                                             // if the sequence loaded isn't large enough to
176                                                                             // encompass the Feature we won't include the
177                                                                             // Feature
178                                                                             if (seq.contains(cFeature)) {
179                                                                                      // valid Feature, so add to Track
180                                                                                      track.addFeature(cFeature);
181                                                                             } else {
182                                                                                      String msg = "Skipping record because the sequence loaded (" + seq.getID() + ") doesn't encompass the entire feature: \n";
183                                                                                      msg += cFeature.getAttributes();
184                                                                                      GloDBUtils.printError(msg);
185                                                                             }
186                                                                    } else {
187                                                                             // not first Feature so add existing
188                                                                             // Feature before reseting the variables
189                                                                             // for the next Feature.  We can assume
190                                                                             // this is a valid Feature.
191                                                                             track.addFeature(cFeature);
192                                                                    }
193                                                      }
194    
195                                                      // we assume this will be a valid Feature
196                                                      validFeature = true;
197                                                                    
198                                                      // assume all features are "exact", thus just have start and
199                                                      // stop positions.
200                                                      int start = 0;
201                                                      int stop = 0;
202                                                      HashMap attributes;
203    
204                                                      // reset Sequence info
205                                                      seq = source;
206                                                      data = "";
207    
208                                                      // process source/boundaries (was call
209                                                      // "gene_boundaries" in the FASTA file) here,
210                                                      // creating a Feature.
211    
212                                                      // XXX: what is the format for the position
213                                                      // information?  can a track have more than one
214                                                      // position pair?  for example one pair looks like
215                                                      // this: (X:1,488..3,280[-]) and can there be
216                                                      // cases with more than one pair such as something
217                                                      // like this: (X:1,488..3,280;4,453..6,654[-])
218                                                      FASTAParser parser = new FASTAParserFly();
219                                                      attributes = parser.parseHeader(line);
220    
221                                                      if (! attributes.containsKey("boundaries")) {
222                                                                    String msg = "Skipping record because no feature information in header: \n";
223                                                                    msg += line;
224                                                                    GloDBUtils.printError(msg);
225                                                                    validFeature = false; // not valid Feature
226                                                                    continue;
227                                                      }
228                                                      String boundaries = (String) attributes.get("boundaries");
229    
230                                                      // get the Sequence ID and start/stop positions
231                                                      //                                              String pos[] = boundaries.split(":", 2);
232                                                      //                                              sourceID = pos[0];
233                                                      // XXX: This assumes that these Features do NOT
234                                                      // have more than one position pair.
235                                                      //                                              pos = pos[1].split("\\.\\.");
236                                                      String pos[] = boundaries.split("\\.\\.");
237    
238                                                      // add position information to the Feature object.  the
239                                                      // positions have commas and thus need to be 'parsed' and not
240                                                      // just converted from String to Integer.
241                                                      NumberFormat nf = NumberFormat.getNumberInstance();
242                                                      try {
243                                                                    start = (nf.parse(pos[0])).intValue();
244                                                                    stop = (nf.parse(pos[1])).intValue();
245                                                      } catch (Exception e) {
246                                                                    String msg = "Skipping record because unable to parse feature position information: \n";
247                                                                    msg += line;
248                                                                    GloDBUtils.printError(msg);
249                                                                    validFeature = false; // not valid Feature
250                                                                    continue;
251                                                      }
252    
253                                                      if (seq == null) {
254                                                                    // we don't have user-specified source info,
255                                                                    // so check the "gene_boundaries" Sequence ID
256                                                                    // to see if it's valid
257                                                                    if (attributes.containsKey("source")) {
258                                                                             sourceID = (String) attributes.get("source");
259                                                                             seq = ObjectHandles.getSequence(sourceID); 
260                                                                    }
261                                                                             
262                                                                    if ((seq == null) || 
263                                                                             (! seq.contains(start)) || (! seq.contains(stop))) {
264                                                                             // still haven't found a valid Sequence
265                                                                             // for this Feature, so check the ID at
266                                                                             // the beginning of the header line.  The
267                                                                             // ID was parsed by FASTAParserFly and
268                                                                             // added as an attribute.
269                                                                             if (attributes.containsKey("ID")) {
270                                                                                      sourceID = (String) attributes.get("ID");
271                                                                                      
272                                                                                      // we need to test if new source ID is valid
273                                                                                      seq = ObjectHandles.getSequence(sourceID);
274                                                                             }
275                                                                             
276                                                                             if ((seq == null) || 
277                                                                                      (! seq.contains(start)) || (! seq.contains(stop))) {
278                                                                                      // XXX If still no source info, then need
279                                                                                      // to load the source info from the file,
280                                                                                      // creating a new Sequence object.  If
281                                                                                      // sourceID is empty, then a random ID
282                                                                                      // will be created in Sequence().
283    
284                                                                                      // If we've gotten this far then the
285                                                                                      // Feature can't be associated with
286                                                                                      // any existing Sequence and so we're
287                                                                                      // creating a Sequence here.  However,
288                                                                                      // this isn't very useful because it's
289                                                                                      // not likely that other Features will
290                                                                                      // share this Sequence.  There's also
291                                                                                      // no capacity to load this Sequence
292                                                                                      // data later, so we are going to have
293                                                                                      // to load it now as well, which is
294                                                                                      // very inefficient.
295                                                                                      try {
296                                                                                                    GloDBUtils.printMsg("Source \"" + sourceID + "\" doesn't exist, loading sequence data.");
297                                                                                                    seq = new Sequence(sourceID);
298                                                                                                    seq.setAttributes(attributes);
299                                                                                      } catch (InvalidIDException e) {
300                                                                                                    // This shoud never be reached but
301                                                                                                    // is here just in case something
302                                                                                                    // goes wrong above.
303                                                                                                    String newID = Sequence.randomID("_S");
304                                                                                                    String msg = "Source \"" + sourceID + "\" already exists, using ID \"" 
305                                                                                                             + newID + "\" instead.";
306                                                                                                    GloDBUtils.printMsg(msg, GloDBUtils.WARNING);
307                                                                                                    seq = new Sequence(newID);
308                                                                                                    seq.setAttributes(attributes);
309                                                                                      }
310                                                                                      
311                                                                                      // use the Feature's start position
312                                                                                      // as the offset position for the
313                                                                                      // Sequence data
314    
315                                                                                      // XXX we should shift the feature
316                                                                                      // start/stop postions to go from 0 to
317                                                                                      // (length-offset).
318                                                                                      seq.setOffset(start);
319    
320                                                                                      // at this point we need to load the
321                                                                                      // Sequence data from the input file.
322                                                                                      loadSequenceData = true;
323                                                                             }
324                                                                    }
325                                                      }
326    
327                                                      // starting a new Feature.  'Seq' is either set
328                                                      // to 'sourceID', as provided when this method was
329                                                      // called, or to the Sequence contig loaded from
330                                                      // this FASTA file.
331                                                      cFeature = new ExactFeature(start, stop, seq); 
332                                                      // add attributes to Feature object
333                                                      cFeature.setAttributes(attributes);
334                                             } else {
335                                                      // if necessary, load Sequence/contig data
336                                                      if (loadSequenceData) data += line;
337                                             }
338                                    }
339    
340                                    // add last Feature's info
341                                    if (validFeature && (cFeature != null)) {
342                                             if (loadSequenceData) { 
343                                                      // we need to save the Sequence/contig data
344                                                      seq.setData(data);
345    
346                                                      // make sure that the Feature is valid; that is,
347                                                      // if the sequence loaded isn't large enough to
348                                                      // encompass the Feature we won't include the
349                                                      // Feature
350                                                      if (seq.contains(cFeature)) {
351                                                                    // valid Feature, so add to Track
352                                                                    track.addFeature(cFeature);
353                                                      } else {
354                                                                    String msg = "Skipping record because the sequence loaded (" + seq.getID() + ") doesn't encompass the entire feature: \n";
355                                                                    msg += cFeature.getAttributes();
356                                                                    GloDBUtils.printError(msg);
357                                                      }
358                                             } else {
359                                                      // valid Feature, so add to Track
360                                                      track.addFeature(cFeature);
361                                             }
362                                    }
363    
364                                    bReader.close();
365                      } catch (FileNotFoundException e) {
366                                    GloDBUtils.printError("File not found: " + e.getMessage());
367                                    return null;
368                      } catch (IOException e) {
369                                    GloDBUtils.printError("Error reading file: " + filename);
370                                    return null;
371                      }
372    
373                      if (track.numFeatures() == 0) {
374                                    // this assumes an empty Track is a mistake, so return null
375                                    GloDBUtils.printError("Unable to load any features from the file: " + filename);
376                                    return null;
377                      }
378    
379                      // add track to trackPool
380                      try {
381                                    ObjectHandles.addTrack(track);
382                      } catch (InvalidIDException e) {
383                                    String id_new = Track.randomID("_T");
384                                    String msg = "ID \"" + track.getID() + "\" already exists, using ID \"" + id_new + "\" instead.";
385                                    GloDBUtils.printWarning(msg);
386                                    
387                                    // add self to set of all Tracks, using new ID
388                                    track.setID(id_new, false);
389                                    ObjectHandles.addTrack(track);
390                      }
391    
392                      GloDBUtils.printMsg("Loaded FASTA file: " + filename);
393                      return track;
394             }
395    
396             /** 
397              * Save the Track to a file based on its ID.  This will overwrite
398              * any existing file.  This will append ".fasta" to the filename.
399              */
400             public void save(String id) {
401                      // add ".fasta" filename extension, if necessary
402                      String filename = id;
403                      if ((! filename.endsWith(".fa")) && (! filename.endsWith(".fas")) 
404                                    && (! filename.endsWith(".fasta"))) {
405                                    filename += ".fasta";
406                      }
407    
408                      save(id, filename, true);
409             }
410    
411             /**
412              * Save all Features in a FASTA file.
413              *
414              * @XXX need to throw FileIO exceptions, rather than just print
415              * errors.
416              * @XXX How should the attributes be formatted?  Should we remove
417              * 'ID', 'descriptors', 'dbxref', 'strand', 'source', and
418              * 'boundaries' from the header since these were most likely added
419              * when we created the header?
420              */
421             public void save(String id, String filename, boolean overwrite) {
422                      // add ".fasta" filename extension, if necessary
423                      if ((! filename.endsWith(".fa")) && (! filename.endsWith(".fas")) 
424                                    && (! filename.endsWith(".fasta"))) {
425                                    filename += ".fasta";
426                      }
427    
428                      File file = new File(filename);
429                      // if the file already exists and not supposed to overwrite
430                      // it, then return on error.
431                      if (file.exists() && (! overwrite)) {
432                                    GloDBUtils.printError("File \"" + filename + "\" already exists.");
433                                    return;
434                      }
435    
436                      try {
437                                    Track track = ObjectHandles.getTrack(id);
438                                    if (track == null) {
439                                             GloDBUtils.printError("Track \"" + id + "\" doesn't exist.");
440                                             return;
441                                    }
442    
443                                    FileWriter fWriter = new FileWriter(file);
444                                    BufferedWriter bWriter = new BufferedWriter(fWriter);
445    
446                                    for (Iterator s = track.getSourceSet().iterator(); s.hasNext();) {
447                                             String sequenceID = (String) s.next();
448                                             Sequence sequence = (Sequence) ObjectHandles.sequencePool.get(sequenceID);
449    
450                                             // get sequence data for this source
451                                             String seqData = sequence.getData();  
452                                             int offset = sequence.getOffset();
453    
454                                             for (Iterator i = track.featuresBySource(sequenceID).iterator(); i.hasNext();) {
455                                                      Feature feature = (Feature) i.next();
456    
457                                                      // XXX should include more formatting
458                                                      String header = ">";
459                                                      
460                                                      // create a copy of the attributes so we can remove
461                                                      // objects from the HashMap as we process them below
462                                                      HashMap attribs = feature.getAttributesMap();
463                                                      
464                                                      // start with the ID attribute, if not present, then
465                                                      // use the Track's ID.
466                                                      if (attribs.containsKey("ID")) {
467                                                                    header += attribs.get("ID");
468                                                                    attribs.remove("ID");
469                                                      } else {
470                                                                    header += id;
471                                                      }
472                                                      
473                                                      // if contains 'descriptors' then remove label
474                                                      if (attribs.containsKey("descriptors")) {
475                                                                    header += " " + attribs.get("descriptors");
476                                                                    attribs.remove("descriptors");
477                                                      }
478                                                      
479                                                      // if "gene_boundaries" already exists, then we
480                                                      // probably didn't process this header and so we
481                                                      // should just leave it alone
482                                                      if (! attribs.containsKey("gene_boundaries")) {
483                                                                    String gb = "gene_boundaries:(" + feature.getSource().getID() + ":";
484                                                                    gb += feature.getStart() + ".." + feature.getStop();
485                                                                    
486                                                                    // if 'source' already handled, then remove from
487                                                                    // attribs map
488                                                                    if (attribs.containsKey("source")) {
489                                                                             String value = (String) attribs.get("source");
490                                                                             //                                                             if (value.equalsIgnoreCase(feature.getSource().getID())) {
491                                                                             if (value.equals(feature.getSource().getID())) {
492                                                                                      attribs.remove("source");
493                                                                             }
494                                                                    }
495                                                                    
496                                                                    // if 'boundaries' exists then remove from attribs
497                                                                    // map, because this should be equivalent to the
498                                                                    // Feature's start/stop
499                                                                    attribs.remove("boundaries");
500                                                                    
501                                                                    if (attribs.containsKey("strand")) {
502                                                                             gb += "[" + (String) attribs.get("strand") + "]";
503                                                                             attribs.remove("strand");  // don't need anymore
504                                                                    }
505                                                                    gb += ")";
506                                                                    header += " " + gb;
507                                                      }
508                                                      
509                                                      // if contains 'dbxref' then remove label and enclose
510                                                      // in '()'
511                                                      if (attribs.containsKey("dbxref")) {
512                                                                    header += " (";
513                                                                    Set dbxref = (Set) attribs.get("dbxref");
514                                                                    for (Iterator dI = dbxref.iterator(); dI.hasNext();) {
515                                                                             header += dI.next();
516                                                                    }
517                                                                    header += ")";
518                                                                    attribs.remove("dbxref");
519                                                      }
520                                                      
521                                                      // add remaining attributes to the header
522                                                      for (Iterator l = (attribs.keySet()).iterator(); l.hasNext();) {
523                                                                    String key = (String) l.next();
524                                                                    header += " " + key + ":" + attribs.get(key);
525                                                      }
526                                                      
527                                                      bWriter.write(header);
528                                                      bWriter.newLine();
529                                                      
530                                                      // if offset = 0, then not set so need to adjust for
531                                                      // sequence starting at 1 and String starting at 0.  If
532                                                      // offset is set, then 
533                                                      int start = feature.getStart();
534                                                      int stop = feature.getStop();
535                                                      if (offset == 0) {
536                                                                    start -= 1;
537                                                      } else {
538                                                                    start = start - offset;
539                                                                    stop = (stop - offset) + 1;
540                                                      }
541    
542                                                      // output to file with Sequence.FORMAT_WIDTH
543                                                      // characters per line
544                                                      String boundedData = seqData.substring(start, stop);
545                                                      int dataLen = boundedData.length();
546                                                      int idx = Sequence.FORMAT_WIDTH;
547                                                      while (idx < dataLen) {
548                                                                    bWriter.write(boundedData.substring(idx - Sequence.FORMAT_WIDTH, idx) + "\n");
549                                                                    idx += Sequence.FORMAT_WIDTH;
550                                                      }
551                                                      if (idx >= dataLen) bWriter.write(boundedData.substring(idx - Sequence.FORMAT_WIDTH, 
552                                                                                                                                                                                                             dataLen));
553                                                      bWriter.newLine();
554                                             }
555                                    }
556    
557                                    bWriter.newLine();
558                                    bWriter.flush();
559                                    bWriter.close();
560                      } catch (FileNotFoundException e) {
561                                    // problem with FileOutputStream
562                                    GloDBUtils.printError("File \"" + filename + "\" can not be opened.");
563                      } catch (IOException e) {
564                                    // problem with ObjectOutputStream.  XXX do we need to
565                                    // close 'oStream'?
566                                    GloDBUtils.printError("Error writting output file \"" + filename + "\".");
567                      }
568             }
569    
570             /** 
571              * FASTA specific FileFilter. 
572              * @XXX This should use EXT.
573              */
574             private class FASTAFilter extends FileFilter {
575                      public boolean accept(File f) {
576                                    // accept directories
577                                    if (f.isDirectory()) return true;
578                                    
579                                    // if true, then don't filter by file extensions.
580                                    if (GUIUtils.showAllFiles()) return true;
581    
582                                    // accept files ending in '.fasta' or '.fas' or '.fa'
583                                    if ((f.getName()).endsWith(".fasta")) return true;
584                                    if ((f.getName()).endsWith(".fas")) return true;
585                                    if ((f.getName()).endsWith(".fa")) return true;
586    
587                                    return false;
588                      }
589                      
590                      // set the filter's description
591                      public String getDescription() { return DESC; }
592             }
593    
594    } // FASTATrack.java
595    
596