001    /*
002     * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of
003     * Pennsylvania.
004     *
005     * This file is part of Glo-DB.
006     * 
007     * Glo-DB is free software: you can redistribute it and/or modify it
008     * under the terms of the GNU General Public License as published by
009     * the Free Software Foundation, either version 3 of the License, or
010     * (at your option) any later version.
011     * 
012     * Glo-DB is distributed in the hope that it will be useful, but
013     * WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * General Public License for more details.
016     * 
017     * You should have received a copy of the GNU General Public License
018     * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>.
019     *
020     * @(#)FASTASequence.java
021     */
022    
023    package edu.upenn.gloDB.io;
024    
025    import edu.upenn.gloDB.*;
026    import edu.upenn.gloDB.gui.GUIUtils;
027    import java.io.*;
028    import java.util.HashSet;
029    import java.util.HashMap;
030    import javax.swing.filechooser.FileFilter;
031    
032    /**
033     * Import Sequence data from a FASTA file.  The basic file format
034     * dictates a header line at the beginning of each sequence.  There
035     * are no standards as to what the header line should contain or how
036     * it should be formatted, other than to stipulate that it begins with
037     * a ">".  Thus this format sufficient for coding Sequence objects but
038     * not ideal for sequence annotations (Features).  Since some sites,
039     * such as www.fruitfly.org, release annotations as FASTA files, some
040     * attempt has been made to parse the headers from specific sites.
041     * Users can use the FASTAParser interface to create their own header
042     * parsers as well.
043     *
044     * @XXX can we assume that the header starts with a Sequence ID?
045     *
046     * @author  Stephen Fisher
047     * @version $Id: FASTASequence.java,v 1.31.2.16 2007/03/01 21:17:33 fisher Exp $
048     */
049    
050    public class FASTASequence implements SequenceFile, SequenceLoader {
051    
052             private final int ID = FileIO.FASTA;
053             private final String DESC = "FASTA files (*.fa; *.fas; *.fasta)";
054             private final String[] EXT = {".fa", ".fas", ".fasta"};
055             private final FileFilter fileFilter = new FASTAFilter();
056    
057        //--------------------------------------------------------------------------
058        // Setters and Getters
059    
060             public int getID() { return ID; }
061    
062             public String getDesc() { return DESC; }
063    
064             public String[] getExt() { return EXT; }
065    
066             public FileFilter getFileFilter() { return fileFilter; }
067    
068        //--------------------------------------------------------------------------
069        // Miscellaneous Methods
070    
071             /** 
072              * Return the Sequence data as a String.  This assumes a single
073              * Sequence per file.
074              *
075              * @XXX This should allow for FASTA files with more than one
076              * Sequence -- can use headers to find correct data.
077              */
078             public String getData(HashMap loaderArgs) {
079                      String filename = (String) loaderArgs.get("filename");
080                      if (GloDBUtils.isEmpty(filename)) {
081                                    GloDBUtils.printError("Empty file name, can't load source data.");
082                                    return "";
083                      }
084    
085                      try {
086                                    BufferedReader bReader = new BufferedReader(new FileReader(filename));
087    
088                                    String line;
089    
090                                    // load sequence header
091                                    if ((line = bReader.readLine()) != null) {
092                                             // make sure header exists
093                                             if (line.startsWith(">")) {
094                                                      GloDBUtils.printMsg("Loading: " + filename);
095                                             } else {
096                                                      GloDBUtils.printError("File does not contain a header: " + filename);
097                                                      bReader.close();
098                                                      return null;
099                                             }
100                                    } else {
101                                             GloDBUtils.printError("Empty file: " + filename);
102                                             bReader.close();
103                                             return null;
104                                    }
105    
106    
107                                    int fileLength = (int) (new File(filename)).length();
108    
109                                    // Send output to the command line because the GUI hangs
110                                    // during the loading.
111                                    System.out.print("Working:...");
112                                    int cnt = 0;
113                                    int interval = (fileLength) / 500;
114                                    if (interval == 0) interval = 1;
115    
116                                    // load sequence data. We use a StringBuffer for the
117                                    // loading because this entails a lot of concatinations
118                                    // which are very slow to perform on String objects but
119                                    // very fast for StringBuffers.
120                                    StringBuffer sb = new StringBuffer(fileLength);
121                                    while ((line = bReader.readLine()) != null) {
122                                             // stop if reach another sequence
123                                             if (line.startsWith(">")) break;
124                                             sb.append(line);
125                                             if ((cnt++ % interval) == 0) System.out.print(".");
126                                    }
127                                    System.out.println("");
128    
129                                    bReader.close();
130                                    return sb.toString();
131                      } catch (FileNotFoundException e) {
132                                    GloDBUtils.printError("File not found: " + e.getMessage());
133                                    return null;
134                      } catch (IOException e) {
135                                    GloDBUtils.printError("Error reading file: " + filename);
136                                    return null;
137                      }
138             }
139    
140             /**
141              * Load the first sequence in the FASTA file and return the
142              * resulting Sequence object.
143              */
144             public Sequence load(String filename) {
145                      return load(filename, "", new FASTAParserMinimal());
146             }
147    
148             /**
149              * Load the first sequence in the FASTA file and return the
150              * resulting Sequence object.
151              */
152             public Sequence load(String filename, String id) {
153                      return load(filename, id, new FASTAParserMinimal());
154             }
155    
156             /**
157              * Load the first sequence in the FASTA file and return the
158              * resulting Sequence object.
159              *
160              * @XXX need to throw FileIO exceptions, rather than just print
161              * errors.
162              */
163             public Sequence load(String filename, String id, FASTAParser parser) {
164                      File file = new File(filename);
165    
166                      if (id.length() == 0) {
167                                    // Need to create Sequence's ID. Remove ".fasta" or ".fas"
168                                    // or ".fa" filename extension, if present.  Use File() to
169                                    // remove any path info from the filename so that the ID
170                                    // is just the name.
171                                    id = file.getName();
172                                    if (id.endsWith(".fasta")) id = id.substring(0, id.length()-6);
173                                    else if (id.endsWith(".fas")) id = id.substring(0, id.length()-4);
174                                    else if (id.endsWith(".fa")) id = id.substring(0, id.length()-3);
175                      }
176    
177                      Sequence sequence = new Sequence(id); 
178                      //              String data = "";  // store sequence data as read from file.
179    
180                      try {
181                                    BufferedReader bReader = new BufferedReader(new FileReader(file));
182    
183                                    String line;
184    
185                                    // load sequence header
186                                    if ((line = bReader.readLine()) != null) {
187                                             // make sure header exists
188                                             if (line.startsWith(">")) {
189                                                      GloDBUtils.printMsg("Loading: " + filename);
190                                                      sequence.setAttributes(parser.parseHeader(line));
191                                                      // setup parameters to load data later, if necessary
192                                                      sequence.setDataLoader(this);
193                                                      HashMap loaderArgs = new HashMap();
194                                                      loaderArgs.put("filename", filename);
195                                                      sequence.setLoaderArgs(loaderArgs);
196                                             } else {
197                                                      // since didn't correctly load file, remove the
198                                                      // Sequence we just created
199                                                      ObjectHandles.removeSequence(sequence);  
200    
201                                                      GloDBUtils.printError("File doesn't contain a header: " + filename);
202                                                      bReader.close();
203                                                      return null;
204                                             }
205                                    } else {
206                                             // since didn't correctly load file, remove the
207                                             // Sequence we just created
208                                             ObjectHandles.removeSequence(sequence);  
209    
210                                             GloDBUtils.printError("Empty file: " + filename);
211                                             bReader.close();
212                                             return null;
213                                    }
214    
215                                    /*
216                                    // load sequence data
217                                    while ((line = bReader.readLine()) != null) {
218                                             // stop if reach another sequence
219                                             if (line.startsWith(">")) break;
220                                             data += line;
221                                    }
222                                    sequence.setData(data);
223                                    */
224    
225                                    bReader.close();
226                      } catch (FileNotFoundException e) {
227                                    GloDBUtils.printError("File not found: " + e.getMessage());
228                                    return null;
229                      } catch (IOException e) {
230                                    GloDBUtils.printError("Error reading file: " + filename);
231                                    return null;
232                      }
233    
234                      return sequence;
235             }
236    
237             /**
238              * Load all Sequences in the FASTA file and return a Set
239              * containing the resulting Sequence objects.  
240              *
241              * @XXX need to throw FileIO exceptions, rather than just print
242              * errors.
243              */
244             public HashSet loadAll(String filename) {
245                      return loadAll(filename, new FASTAParserMinimal());
246             }
247    
248             /**
249              * Load all Sequences in the FASTA file and return a set
250              * containing the resulting Sequence objects.
251              *
252              * @XXX need to throw FileIO exceptions, rather than just print
253              * errors.
254              */
255             public HashSet loadAll(String filename, FASTAParser parser) {
256                      HashSet sequences = new HashSet();  // set of loaded sequences
257    
258                      // make sure we have at least a minimal parser
259                      if (parser == null) parser = new FASTAParserMinimal();
260    
261                      try {
262                                    BufferedReader bReader = new BufferedReader(new FileReader(filename));
263    
264                                    String line;
265                                    boolean firstSequence = true;
266    
267                                    Sequence seq = null;
268                                    //                              String data = "";  // store sequence data as read from file.
269    
270                                    while ((line = bReader.readLine()) != null) {
271                                             // test for a sequence header
272                                             if (line.startsWith(">")) {
273                                                      if (! firstSequence) {
274                                                                    // not first sequence so append existing sequence before
275                                                                    // reseting the variables for the next sequence.
276                                                                    //                                                              seq.setData(data);
277                                                                    sequences.add(seq);
278                                                      } else {
279                                                                    firstSequence = false;
280                                                      }
281                                                                    
282                                                      HashMap attributes = parser.parseHeader(line);
283                                                      String id = (String) attributes.get("ID");
284                                                      if (id == null) id = "";
285                                                      seq = new Sequence(id); 
286                                                      seq.setAttributes(attributes);
287                                                      // setup parameters to load data later, if necessary
288                                                      seq.setDataLoader(this);
289                                                      HashMap loaderArgs = new HashMap();
290                                                      loaderArgs.put("filename", filename);
291                                                      seq.setLoaderArgs(loaderArgs);
292    
293                                                      /*
294                                                             // THIS CODE REMOVED BECAUSE IT SETS ID AFTER CREATING SEQUENCE
295                                                             // CURRENTLY NOT ALLOWING RENAMING OF SEQUENCES
296                                                      // starting a new sequence
297                                                      seq = new Sequence(); 
298                                                      seq.setAttributes(parser.parseHeader(line));
299                                                      // setup parameters to load data later, if necessary
300                                                      seq.setDataLoader(this);
301                                                      HashMap loaderArgs = new HashMap();
302                                                      loaderArgs.put("filename", filename);
303                                                      seq.setLoaderArgs(loaderArgs);
304                                                      try {
305                                                                    String id = (String) seq.getAttribute("ID");
306    
307                                                                    if (id != null) seq.setID(id);
308                                                                    else seq.setID(Sequence.randomID("_S"));
309                                                      } catch (InvalidIDException e) {
310                                                                    String id = Sequence.randomID("_S");
311                                                                    String msg = "WARNING: ID \"" + seq.getAttribute("ID")
312                                                                             + "\" already exists, using ID \"" + id + "\" instead.";
313                                                                    GloDBUtils.printMsg(msg);
314                                                                    seq.setID(id);
315                                                      }
316                                                      */
317    
318                                                      //                                              data = "";  // store sequence data as read from file.
319                                             } else {
320                                                      // load sequence data
321                                                      //                                              data += line;
322                                             }
323                                    }
324                                    
325                                    if (seq != null) {
326                                             // add last sequences info
327                                             //                                      seq.setData(data);
328                                             sequences.add(seq);
329                                    }
330    
331                                    bReader.close();
332                      } catch (FileNotFoundException e) {
333                                    GloDBUtils.printError("File not found: " + e.getMessage());
334                                    return null;
335                      } catch (IOException e) {
336                                    // XXX we should probably remove the sequences from the
337                                    // sequence pool here
338                                    GloDBUtils.printError("Error reading file: " + filename);
339                                    return null;
340                      }
341    
342                      return sequences;
343             }
344    
345             /** 
346              * Save the Seqeuence to a file based on it's ID.  This will
347              * overwrite any existing file.  This will append ".fasta" to the
348              * filename.
349              */
350             public void save(String id) {
351                      // add ".fasta" filename extension, if necessary
352                      String filename = id;
353                      if ((! filename.endsWith(".fa")) && (! filename.endsWith(".fas")) 
354                                    && (! filename.endsWith(".fasta"))) {
355                                    filename += ".fasta";
356                      }
357    
358                      save(id, filename, true);
359             }
360    
361             /**
362              * Save the Sequence data.  This will make sure the data is loaded
363              * prior to saving the Sequence.
364              */
365             public void save(String id, String filename, boolean overwrite) {
366                      GloDBUtils.printMsg("Saving 'FASTA' sequence files not yet supported.");
367             }
368    
369             public String toString() { return "FASTA File Loader"; }
370    
371             /** 
372              * FASTA specific FileFilter. 
373              * @XXX This should use EXT.
374              */
375             private class FASTAFilter extends FileFilter {
376                      public boolean accept(File f) {
377                                    // accept directories
378                                    if (f.isDirectory()) return true;
379                                    
380                                    // if true, then don't filter by file extensions.
381                                    if (GUIUtils.showAllFiles()) return true;
382    
383                                    // accept files ending in '.fasta' or '.fas' or '.fa'
384                                    if ((f.getName()).endsWith(".fasta")) return true;
385                                    if ((f.getName()).endsWith(".fas")) return true;
386                                    if ((f.getName()).endsWith(".fa")) return true;
387    
388                                    return false;
389                      }
390                      
391                      // set the filter's description
392                      public String getDescription() { return DESC; }
393             }
394    
395    } // FASTASequence.java
396    
397