001    /*
002     * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of
003     * Pennsylvania.
004     *
005     * This file is part of Glo-DB.
006     * 
007     * Glo-DB is free software: you can redistribute it and/or modify it
008     * under the terms of the GNU General Public License as published by
009     * the Free Software Foundation, either version 3 of the License, or
010     * (at your option) any later version.
011     * 
012     * Glo-DB is distributed in the hope that it will be useful, but
013     * WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * General Public License for more details.
016     * 
017     * You should have received a copy of the GNU General Public License
018     * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>.
019     *
020     * @(#)GFFTrack.java
021     */
022    
023    package edu.upenn.gloDB.io;
024    
025    import edu.upenn.gloDB.*;
026    import edu.upenn.gloDB.gui.GUIUtils;
027    import java.io.*;
028    import java.util.HashMap;
029    import java.util.Iterator;
030    import javax.swing.filechooser.FileFilter;
031    
032    /**
033     * Import/Export Track data from/to GFF files. <br><br>
034     *
035     * File format (each column is separated by a tab character '\t'):
036     * <table border="1">
037     * <tr><td>seqname</td><td>source</td><td>feature</td><td>start</td><td>end</td><td>score</td><td>strand</td><td>frame</td><td>[attributes]</td></tr>
038     * </table><br>
039     *
040     * Examples:
041     * <table border="1">
042     * <tr><td>SEQ1</td><td>EMBL</td><td>splice5</td><td>172</td><td>173</td><td>.</td><td>+</td><td>.</td><td>&nbsp;</td></tr> 
043     * <tr><td>SEQ1</td><td>netgene</td><td>splice5</td><td>172</td><td>173</td><td>0.94</td><td>+</td><td>.</td><td>&nbsp;</td></tr>
044     * <tr><td>SEQ1</td><td>genie</td><td>sp5-20</td><td>163</td><td>182</td><td>2.3</td><td>+</td><td>.</td><td>&nbsp;</td></tr> 
045     * <tr><td>SEQ2</td><td>grail</td><td>ATG</td><td>17</td><td>19</td><td>2.1</td><td>-</td><td>0</td><td>&nbsp;</td></tr> 
046     * <tr><td>seq1</td><td>BLASTX</td><td>similarity</td><td>101</td><td>235</td><td>87.1</td><td>+</td><td>0</td><td>Target "HBA_HUMAN" 11 55 ; E_value 0.0003</td></tr>
047     * <tr><td>dJ102G20</td><td>GD_mRNA</td><td>coding_exon</td><td>7105</td><td>7201</td><td>.</td><td>-</td><td>2</td><td>Sequence "dJ102G20.C1.1"</td></tr>
048     * <tr><td>X</td><td>gadfly</td><td>exon</td><td>3118</td><td>3280</td><td>.</td><td>-</td><td>.</td><td>genegrp=CG3038; transgrp=CG3038-RB; name=CG3038:1</td></tr>
049     * <tr><td>X</td><td>gadfly</td><td>exon</td><td>2850</td><td>3016</td><td>.</td><td>-</td><td>.</td><td>genegrp=CG3038; transgrp=CG3038-RB; name=CG3038:2</td></tr>
050     * </table>
051     *
052     * @author  Stephen Fisher
053     * @version $Id: GFFTrack.java,v 1.1.2.23 2007/02/22 21:10:27 fisher Exp $
054     */
055    
056    public class GFFTrack implements TrackFile {
057    
058             private final int ID = FileIO.GFF;
059             private final String DESC = "GFF files (*.gff)";
060             private final String[] EXT = {".gff"};
061             private final FileFilter fileFilter = new GFFFilter();
062    
063        //--------------------------------------------------------------------------
064        // Setters and Getters
065    
066             public int getID() { return ID; }
067    
068             public String getDesc() { return DESC; }
069    
070             public String[] getExt() { return EXT; }
071    
072             public FileFilter getFileFilter() { return fileFilter; }
073    
074        //--------------------------------------------------------------------------
075        // Miscellaneous Methods
076    
077             /**
078              * Load all Features in the GFF file into a single Track and
079              * return the resulting Track object.  
080              */
081             public Track load(String filename) {
082                      return load(filename, "");
083             }
084    
085             /**
086              * Load all Features in the GFF file into a single Track and
087              * return the resulting Track object.
088              *
089              * @XXX need to throw FileIO exceptions, rather than just print
090              * errors.
091              */
092             public Track load(String filename, String sourceID) {
093                      // when creating Track's ID, if necessary, remove ".gff"
094                      // filename extension.  Use File() to remove any path info
095                      // from the filename so that the ID is just the name.
096                      File file = new File(filename);
097                      String id = file.getName();
098                      if (id.endsWith(".gff")) id = id.substring(0, id.length()-4);
099    
100    
101                      Track track = new Track(false, id);
102                      Sequence source = null;
103                      if (! GloDBUtils.isEmpty(sourceID)) {
104                                    source = ObjectHandles.getSequence(sourceID);
105                                    if (source == null) {
106                                             String msg = "The source ID \"" + sourceID + "\" isn't valid.";
107                                             msg += "Source IDs will be set from the feature headers.";
108                                             GloDBUtils.printMsg(msg, GloDBUtils.WARNING);
109                                    }
110                      }
111    
112                      try {
113                                    BufferedReader bReader = new BufferedReader(new FileReader(file));
114                                    String line;
115    
116                                    StringBuffer attributes = new StringBuffer();
117                                    while ((line = bReader.readLine()) != null) {
118                                             line = line.trim();
119    
120                                             // skip all comment ('#') lines and lines that are
121                                             // empty. Don't use GloDBUtils.isEmpty() because it's
122                                             // redundent processing (trims and tests for null).
123                                             if ((! line.startsWith("#")) && (line.length() > 0)) {
124                                                      // split line at every tab ('\t')
125                                                      String[] fields = line.split("        ");
126                                                      
127                                                      // get a reference to the Sequence for this
128                                                      // Feature.  If source already exists then use
129                                                      // that, else try the first field ('seqName').
130                                                      Sequence seqRef;
131                                                      if (source == null) {
132                                                                    seqRef = ObjectHandles.getSequence(fields[0]);
133                                                                    if (seqRef == null) {
134                                                                             if (true) {
135                                                                                      seqRef = new Sequence(fields[0]);
136                                                                                      GloDBUtils.printWarning("Sequence not found, so created empty Sequence with ID: " 
137                                                                                                                                                     + fields[0]);
138                                                                             } else {
139                                                                                      GloDBUtils.printError("Skipping feature because sequence not found: " 
140                                                                                                                                              + fields[0]);
141                                                                                      continue;
142                                                                             }
143                                                                    }
144                                                      } else {
145                                                                    seqRef = source;
146                                                      }
147    
148                                                      // create a new Feature object
149                                                      Feature feature = new ExactFeature(Integer.parseInt(fields[3]), 
150                                                                                                                                                     Integer.parseInt(fields[4]), seqRef); 
151    
152                                                      // get Feature attributes
153                                                      attributes.setLength(0); // erase buffer
154                                                      attributes.append("source=" + fields[1]);     // get source
155                                                      attributes.append(";feature=" + fields[2]);   // get feature label
156                                                      attributes.append(";score=" + fields[5]);     // get score
157                                                      attributes.append(";strand=" + fields[6]);    // get strand
158                                                      attributes.append(";frame=" + fields[7]);     // get frame
159                                                      if (fields.length > 8) {                // get attributes
160                                                                    // this will contain tag/value pairs. Since we
161                                                                    // use ';' as key/value delimiter, we need to
162                                                                    // make sure fields[8] doesn't also contain
163                                                                    // this delimiter.
164                                                                    attributes.append(";attributes=" + fields[8].replace(';', ','));
165                                                                    /*
166                                                                    StringTokenizer tokens = new StringTokenizer(fields[8], ";");
167    
168                                                                    while (tokens.hasMoreTokens()) {
169                                                                             String attrib = tokens.nextToken().trim();
170                                                                             String[] key_value = attrib.split(" ", 2);
171    
172                                                                             if (key_value.length > 1) {
173                                                                                      attributes.put(key_value[0], key_value[1]);
174                                                                             } else {
175                                                                                      // test if uses '=' as delimiter
176                                                                                      // instead of ' '
177                                                                                      key_value = attrib.split("=", 2);
178                                                                                      if (key_value.length > 1) {
179                                                                                                    attributes.put(key_value[0], key_value[1]);
180                                                                                      } else {
181                                                                                                    // still can't parse the
182                                                                                                    // attributes, so add all
183                                                                                                    // attributes as one item
184                                                                                                    attributes.put("attributes", fields[8]);
185                                                                                                    break;
186                                                                                      }
187                                                                             }
188                                                                    }
189                                                                    */
190                                                      }
191                                                      feature.setAttributes(attributes.toString());
192    
193                                                      // add the Feature object to the Track
194                                                      track.addFeature(feature);
195                                             }
196                                    }
197    
198                                    bReader.close();
199                      } catch (FileNotFoundException e) {
200                                    GloDBUtils.printError("File not found: " + e.getMessage());
201                                    return null;
202                      } catch (IOException e) {
203                                    GloDBUtils.printError("Error reading file: " + filename);
204                                    return null;
205                      }
206    
207                      if (track.numFeatures() == 0) {
208                                    // this assumes an empty Track is a mistake, so return null
209                                    GloDBUtils.printError("Unable to load any features from the file: " + filename);
210                                    return null;
211                      }
212    
213                      // add track to trackPool
214                      try {
215                                    ObjectHandles.addTrack(track);
216                      } catch (InvalidIDException e) {
217                                    String id_new = Track.randomID("_T");
218                                    String msg = "ID \"" + track.getID() + "\" already exists, using ID \"" + id_new + "\" instead.";
219                                    GloDBUtils.printWarning(msg);
220                                    
221                                    // add self to set of all Tracks, using new ID
222                                    track.setID(id_new, false);
223                                    ObjectHandles.addTrack(track);
224                      }
225    
226                      GloDBUtils.printMsg("Loaded GFF file: " + filename);
227                      return track;
228             }
229    
230             /** 
231              * Save the Track to a file based on it's ID.  This will overwrite
232              * any existing file.  This will append ".gff" to the filename.
233              */
234             public void save(String id) {
235                      // add ".gff" filename extension, if necessary
236                      String filename = id;
237                      if (! filename.endsWith(".gff")) filename += ".gff";
238    
239                      save(id, filename, true);
240             }
241    
242             /**
243              * Save all Features in a GFF file.
244              *
245              * @XXX need to throw FileIO exceptions, rather than just print
246              * errors.
247              * @XXX Should offer option to include Sequence data.
248              */
249             public void save(String id, String filename, boolean overwrite) {
250                      // if empty filename, then exit
251                      if (filename.length() == 0) return;
252    
253                      // add ".gff" filename extension, if necessary
254                      if (! filename.endsWith(".gff")) filename += ".gff";
255    
256                      File file = new File(filename);
257                      // if the file already exists and not supposed to overwrite
258                      // it, then return on error.
259                      if (file.exists() && (! overwrite)) {
260                                    GloDBUtils.printError("File \"" + filename + "\" already exists.");
261                                    return;
262                      }
263    
264                      // counters for potential errors in output file
265                      int featureLabelErrors = 0;
266                      int strandLabelErrors = 0;
267    
268                      try {
269                                    Track track = ObjectHandles.getTrack(id);
270                                    if (track == null) {
271                                             GloDBUtils.printError("Track \"" + id + "\" doesn't exist.");
272                                             return;
273                                    }
274    
275                                    FileWriter fWriter = new FileWriter(file);
276                                    BufferedWriter bWriter = new BufferedWriter(fWriter);
277    
278                                    for (Iterator i = track.featureIterator(); i.hasNext();) {
279                                             Feature feature = (Feature) i.next();
280    
281                                             // create a copy of the attributes so we can remove
282                                             // objects from the HashMap as we process them below
283                                             HashMap attribs = feature.getAttributesMap();
284    
285                                             // add sequence ID
286                                             String line = feature.getSourceID();
287    
288                                             // add source info
289                                             if (attribs.containsKey("source")) {
290                                                      line += "\t" + attribs.get("source");
291                                                      attribs.remove("source");
292                                             } else {
293                                                      // if no source attribute then we are the source
294                                                      line += "\tGloDB";
295                                             }                                                
296    
297                                             // add feature label
298                                             if (attribs.containsKey("feature")) {
299                                                      line += "\t" + attribs.get("feature");
300                                                      attribs.remove("feature");
301                                             } else {
302                                                      // if no feature attribute then use the track ID.
303                                                      // XXX this is probably not correct
304                                                      line += "\t" + id;
305                                                      featureLabelErrors++;
306                                             }                                                
307    
308                                             // add start/stop info
309                                             line += "\t" + feature.getStart();
310                                             line += "\t" + feature.getStop();
311    
312                                             // add score info
313                                             if (attribs.containsKey("score")) {
314                                                      line += "\t" + attribs.get("score");
315                                                      attribs.remove("score");
316                                             } else {
317                                                      // if no score attribute then use '.'
318                                                      line += "\t.";
319                                             }                                                
320    
321                                             // add strand info
322                                             if (attribs.containsKey("strand")) {
323                                                      line += "\t" + attribs.get("strand");
324                                                      attribs.remove("strand");
325                                             } else {
326                                                      // if no strand attribute then use '.'
327                                                      // XXX this is probably not correct
328                                                      line += "\t+";
329                                                      strandLabelErrors++;
330                                             }                                                
331    
332                                             // add frame info
333                                             if (attribs.containsKey("frame")) {
334                                                      line += "\t" + attribs.get("frame");
335                                                      attribs.remove("frame");
336                                             } else {
337                                                      // if no frame attribute then use '.'
338                                                      line += "\t.";
339                                             }                                                
340    
341                                             // add attributes info
342                                             if (attribs.containsKey("attributes")) {
343                                                      line += "\t" + attribs.get("attributes");
344                                                      attribs.remove("attributes");
345                                             }                                                
346    
347                                             // add remaining attributes
348                                             for (Iterator l = (attribs.keySet()).iterator(); l.hasNext();) {
349                                                      String key = (String) l.next();
350                                                      line += "; " + key + " " + attribs.get(key);
351                                             }
352    
353                                             bWriter.write(line);
354                                             bWriter.newLine();
355                                    }
356    
357                                    bWriter.flush();
358                                    bWriter.close();
359                      } catch (FileNotFoundException e) {
360                                    // problem with FileOutputStream
361                                    GloDBUtils.printError("File \"" + filename + "\" can not be opened.");
362                      } catch (IOException e) {
363                                    // problem with ObjectOutputStream.  XXX do we need to
364                                    // close bWriter()?
365                                    GloDBUtils.printError("Error writting output file \"" + filename + "\".");
366                      }
367    
368                      if (featureLabelErrors > 0) {
369                                    String msg = "Number of feature labels not found: " + featureLabelErrors + "\n";
370                                    msg += "     Used \"" + id + "\" instead.";
371                                    GloDBUtils.printError(msg);
372                      }
373    
374                      if (strandLabelErrors > 0) {
375                                    String msg = "Strand attribute not found: " + strandLabelErrors + "\n";
376                                    msg += "     Used \"+\" instead.";
377                                    GloDBUtils.printError(msg);
378                      }
379             }
380    
381             /** Format all Features into a GFF like string. */
382             public String format(String id) {
383                      // counters for potential errors in output file
384                      int featureLabelErrors = 0;
385                      int strandLabelErrors = 0;
386    
387                      Track track = ObjectHandles.getTrack(id);
388                      if (track == null) {
389                                    GloDBUtils.printError("Not a valid track");
390                                    return "";
391                      }
392    
393                      String out = "";
394    
395                      for (Iterator i = track.featureIterator(); i.hasNext();) {
396                                    Feature feature = (Feature) i.next();
397    
398                                    // create a copy of the attributes so we can remove
399                                    // objects from the HashMap as we process them below
400                                    HashMap attribs = feature.getAttributesMap();
401                                    
402                                    // add sequence ID
403                                    String line = feature.getSourceID();
404                                    
405                                    // add source info
406                                    if (attribs.containsKey("source")) {
407                                             line += "\t" + attribs.get("source");
408                                             attribs.remove("source");
409                                    } else {
410                                             // if no source attribute then we are the source
411                                             line += "\tGloDB";
412                                    }                                                 
413                                    
414                                    // add feature label
415                                    if (attribs.containsKey("feature")) {
416                                             line += "\t" + attribs.get("feature");
417                                             attribs.remove("feature");
418                                    } else {
419                                             // if no feature attribute then use the track ID.
420                                             // XXX this is probably not correct
421                                             line += "\t" + id;
422                                             featureLabelErrors++;
423                                    }                                                 
424                                    
425                                    // add start/stop info
426                                    line += "\t" + feature.getStart();
427                                    line += "\t" + feature.getStop();
428                                    
429                                    // add score info
430                                    if (attribs.containsKey("score")) {
431                                             line += "\t" + attribs.get("score");
432                                             attribs.remove("score");
433                                    } else {
434                                             // if no score attribute then use '.'
435                                             line += "\t.";
436                                    }                                                 
437                                    
438                                    // add strand info
439                                    if (attribs.containsKey("strand")) {
440                                             line += "\t" + attribs.get("strand");
441                                             attribs.remove("strand");
442                                    } else {
443                                             // if no strand attribute then use '.'
444                                             // XXX this is probably not correct
445                                             line += "\t+";
446                                             strandLabelErrors++;
447                                    }                                                 
448                                    
449                                    // add frame info
450                                    if (attribs.containsKey("frame")) {
451                                             line += "\t" + attribs.get("frame");
452                                             attribs.remove("frame");
453                                    } else {
454                                             // if no frame attribute then use '.'
455                                             line += "\t.";
456                                    }                                                 
457                                    
458                                    // add attributes info
459                                    if (attribs.containsKey("attributes")) {
460                                             line += "\t" + attribs.get("attributes");
461                                             attribs.remove("attributes");
462                                    }                                                 
463                                    
464                                    // add remaining attributes
465                                    for (Iterator l = (attribs.keySet()).iterator(); l.hasNext();) {
466                                             String key = (String) l.next();
467                                             line += "; " + key + " " + attribs.get(key);
468                                    }
469                                    
470                                    out += line + "\n";
471                      }
472                      
473                      if (featureLabelErrors > 0) {
474                                    String msg = "Number of feature labels not found: " + featureLabelErrors + "\n";
475                                    msg += "     Used \"" + id + "\" instead.";
476                                    GloDBUtils.printError(msg);
477                      }
478    
479                      if (strandLabelErrors > 0) {
480                                    String msg = "Strand attribute not found: " + strandLabelErrors + "\n";
481                                    msg += "     Used \"+\" instead.";
482                                    GloDBUtils.printError(msg);
483                      }
484    
485                      return out;
486             }
487    
488             /** 
489              * GFF specific FileFilter. 
490              * @XXX This should use EXT.
491              */
492             private class GFFFilter extends FileFilter {
493                      public boolean accept(File f) {
494                                    // accept directories
495                                    if (f.isDirectory()) return true;
496                                    
497                                    // if true, then don't filter by file extensions.
498                                    if (GUIUtils.showAllFiles()) return true;
499    
500                                    // accept files ending in '.gff'
501                                    if ((f.getName()).endsWith(".gff")) return true;
502    
503                                    return false;
504                      }
505                      
506                      // set the filter's description
507                      public String getDescription() { return DESC; }
508             }
509    
510    } // GFFTrack.java
511    
512