/*
 * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of
 * Pennsylvania.
 *
 * This file is part of Glo-DB.
 *
 * Glo-DB is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Glo-DB is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>.
 *
 * @(#)GFFTrack.java
 */

package edu.upenn.gloDB.io;

import edu.upenn.gloDB.*;
import edu.upenn.gloDB.gui.GUIUtils;
import java.io.*;
import java.util.HashMap;
import java.util.Iterator;
import javax.swing.filechooser.FileFilter;

/**
 * Import/Export Track data from/to GFF files. <br><br>
 *
 * File format (each column is separated by a tab character '\t'):
 * <table border="1">
 * <tr><td>seqname</td><td>source</td><td>feature</td><td>start</td><td>end</td><td>score</td><td>strand</td><td>frame</td><td>[attributes]</td></tr>
 * </table><br>
 *
 * Examples:
 * <table border="1">
 * <tr><td>SEQ1</td><td>EMBL</td><td>splice5</td><td>172</td><td>173</td><td>.</td><td>+</td><td>.</td><td> </td></tr>
 * <tr><td>SEQ1</td><td>netgene</td><td>splice5</td><td>172</td><td>173</td><td>0.94</td><td>+</td><td>.</td><td> </td></tr>
 * <tr><td>SEQ1</td><td>genie</td><td>sp5-20</td><td>163</td><td>182</td><td>2.3</td><td>+</td><td>.</td><td> </td></tr>
 * <tr><td>SEQ2</td><td>grail</td><td>ATG</td><td>17</td><td>19</td><td>2.1</td><td>-</td><td>0</td><td> </td></tr>
 * <tr><td>seq1</td><td>BLASTX</td><td>similarity</td><td>101</td><td>235</td><td>87.1</td><td>+</td><td>0</td><td>Target "HBA_HUMAN" 11 55 ; E_value 0.0003</td></tr>
 * <tr><td>dJ102G20</td><td>GD_mRNA</td><td>coding_exon</td><td>7105</td><td>7201</td><td>.</td><td>-</td><td>2</td><td>Sequence "dJ102G20.C1.1"</td></tr>
 * <tr><td>X</td><td>gadfly</td><td>exon</td><td>3118</td><td>3280</td><td>.</td><td>-</td><td>.</td><td>genegrp=CG3038; transgrp=CG3038-RB; name=CG3038:1</td></tr>
 * <tr><td>X</td><td>gadfly</td><td>exon</td><td>2850</td><td>3016</td><td>.</td><td>-</td><td>.</td><td>genegrp=CG3038; transgrp=CG3038-RB; name=CG3038:2</td></tr>
 * </table>
 *
 * @author  Stephen Fisher
 * @version $Id: GFFTrack.java,v 1.1.2.23 2007/02/22 21:10:27 fisher Exp $
 */

public class GFFTrack implements TrackFile {

    /** Number of mandatory tab-separated columns in a GFF feature line. */
    private static final int MIN_FIELDS = 8;

    private final int ID = FileIO.GFF;
    private final String DESC = "GFF files (*.gff)";
    private final String[] EXT = {".gff"};
    private final FileFilter fileFilter = new GFFFilter();

    //--------------------------------------------------------------------------
    // Setters and Getters

    /** Returns the FileIO type constant for GFF files. */
    public int getID() { return ID; }

    /** Returns the human readable description of this file type. */
    public String getDesc() { return DESC; }

    /**
     * Returns the filename extensions handled by this class.  A copy
     * is returned so that callers can't alter the array consulted by
     * the file filter.
     */
    public String[] getExt() { return (String[]) EXT.clone(); }

    /** Returns the FileFilter that accepts GFF files. */
    public FileFilter getFileFilter() { return fileFilter; }

    //--------------------------------------------------------------------------
    // Miscellaneous Methods

    /**
     * Load all Features in the GFF file into a single Track and
     * return the resulting Track object.  The Sequence for each
     * Feature is taken from the first column of the Feature's line.
     *
     * @param filename the GFF file to read
     * @return the new Track, or null if the file couldn't be read or
     * contained no Features
     */
    public Track load(String filename) {
        return load(filename, "");
    }

    /**
     * Load all Features in the GFF file into a single Track and
     * return the resulting Track object.  The Track's ID is the
     * filename without any path information and without a trailing
     * ".gff".  Comment lines ('#') and empty lines are ignored.  Lines
     * that have fewer than 8 tab-separated columns, or whose start/end
     * columns aren't integers, are reported and skipped.
     *
     * @param filename the GFF file to read
     * @param sourceID ID of the Sequence to use for every Feature.  If
     * empty, or if no Sequence with this ID exists, then the Sequence
     * is looked up (and created if missing) from the first column of
     * each line.
     * @return the new Track, or null if the file couldn't be read or
     * contained no Features
     *
     * @XXX need to throw FileIO exceptions, rather than just print
     * errors.
     */
    public Track load(String filename, String sourceID) {
        // when creating Track's ID, if necessary, remove ".gff"
        // filename extension.  Use File() to remove any path info
        // from the filename so that the ID is just the name.
        File file = new File(filename);
        String id = file.getName();
        if (id.endsWith(".gff")) id = id.substring(0, id.length() - 4);

        Track track = new Track(false, id);
        Sequence source = null;
        if (! GloDBUtils.isEmpty(sourceID)) {
            source = ObjectHandles.getSequence(sourceID);
            if (source == null) {
                String msg = "The source ID \"" + sourceID + "\" isn't valid. ";
                msg += "Source IDs will be set from the feature headers.";
                GloDBUtils.printMsg(msg, GloDBUtils.WARNING);
            }
        }

        BufferedReader bReader = null;
        try {
            bReader = new BufferedReader(new FileReader(file));
            String line;
            int lineNum = 0;

            // reused for every Feature to avoid reallocating
            StringBuffer attributes = new StringBuffer();
            while ((line = bReader.readLine()) != null) {
                lineNum++;
                line = line.trim();

                // skip all comment ('#') lines and lines that are
                // empty.  Don't use GloDBUtils.isEmpty() because it's
                // redundant processing (trims and tests for null).
                if ((line.length() == 0) || line.startsWith("#")) continue;

                // split line at every tab ('\t')
                String[] fields = line.split("\t");
                if (fields.length < MIN_FIELDS) {
                    GloDBUtils.printError("Skipping line " + lineNum + " of \"" + filename
                                          + "\": expected at least " + MIN_FIELDS
                                          + " tab-separated columns but found " + fields.length);
                    continue;
                }

                // parse the positions before touching the Sequence
                // pool, so that a bad line doesn't leave an empty
                // Sequence behind.
                int start;
                int end;
                try {
                    start = Integer.parseInt(fields[3].trim());
                    end = Integer.parseInt(fields[4].trim());
                } catch (NumberFormatException e) {
                    GloDBUtils.printError("Skipping line " + lineNum + " of \"" + filename
                                          + "\": start/end are not integers: \""
                                          + fields[3] + "\", \"" + fields[4] + "\"");
                    continue;
                }

                // get a reference to the Sequence for this
                // Feature.  If source already exists then use
                // that, else try the first field ('seqName').
                Sequence seqRef = (source != null) ? source : resolveSequence(fields[0]);

                // create a new Feature object
                Feature feature = new ExactFeature(start, end, seqRef);

                // get Feature attributes
                attributes.setLength(0);  // erase buffer
                attributes.append("source=").append(fields[1]);
                attributes.append(";feature=").append(fields[2]);
                attributes.append(";score=").append(fields[5]);
                attributes.append(";strand=").append(fields[6]);
                attributes.append(";frame=").append(fields[7]);
                if (fields.length > MIN_FIELDS) {
                    // the 9th column contains tag/value pairs and is
                    // stored as a single "attributes" entry.  Since
                    // we use ';' as key/value delimiter, we need to
                    // make sure fields[8] doesn't also contain this
                    // delimiter.
                    attributes.append(";attributes=").append(fields[8].replace(';', ','));
                }
                feature.setAttributes(attributes.toString());

                // add the Feature object to the Track
                track.addFeature(feature);
            }
        } catch (FileNotFoundException e) {
            GloDBUtils.printError("File not found: " + e.getMessage());
            return null;
        } catch (IOException e) {
            GloDBUtils.printError("Error reading file: " + filename);
            return null;
        } finally {
            // always release the file handle, including on errors
            closeQuietly(bReader);
        }

        if (track.numFeatures() == 0) {
            // this assumes an empty Track is a mistake, so return null
            GloDBUtils.printError("Unable to load any features from the file: " + filename);
            return null;
        }

        // add track to trackPool
        try {
            ObjectHandles.addTrack(track);
        } catch (InvalidIDException e) {
            String id_new = Track.randomID("_T");
            String msg = "ID \"" + track.getID() + "\" already exists, using ID \"" + id_new + "\" instead.";
            GloDBUtils.printWarning(msg);

            // add self to set of all Tracks, using new ID
            track.setID(id_new, false);
            ObjectHandles.addTrack(track);
        }

        GloDBUtils.printMsg("Loaded GFF file: " + filename);
        return track;
    }

    /**
     * Save the Track to a file based on its ID.  This will overwrite
     * any existing file.  This will append ".gff" to the filename.
     *
     * @param id ID of the Track to save; also used as the filename
     */
    public void save(String id) {
        // add ".gff" filename extension, if necessary
        String filename = id;
        if (! filename.endsWith(".gff")) filename += ".gff";

        save(id, filename, true);
    }

    /**
     * Save all Features in a GFF file, one Feature per line.  Nothing
     * is written if the filename is empty, if the file exists and
     * 'overwrite' is false, or if no Track with the given ID exists.
     *
     * @param id ID of the Track to save
     * @param filename the output file; ".gff" is appended if missing
     * @param overwrite whether an existing file may be replaced
     *
     * @XXX need to throw FileIO exceptions, rather than just print
     * errors.
     * @XXX Should offer option to include Sequence data.
     */
    public void save(String id, String filename, boolean overwrite) {
        // if empty filename, then exit
        if ((filename == null) || (filename.length() == 0)) return;

        // add ".gff" filename extension, if necessary
        if (! filename.endsWith(".gff")) filename += ".gff";

        File file = new File(filename);
        // if the file already exists and not supposed to overwrite
        // it, then return on error.
        if (file.exists() && (! overwrite)) {
            GloDBUtils.printError("File \"" + filename + "\" already exists.");
            return;
        }

        // look up the Track before opening the file so that an
        // invalid ID doesn't truncate an existing file.
        Track track = ObjectHandles.getTrack(id);
        if (track == null) {
            GloDBUtils.printError("Track \"" + id + "\" doesn't exist.");
            return;
        }

        // counters for potential errors in output file
        LabelErrors errors = new LabelErrors();

        BufferedWriter bWriter = null;
        try {
            bWriter = new BufferedWriter(new FileWriter(file));

            for (Iterator i = track.featureIterator(); i.hasNext();) {
                bWriter.write(formatFeature((Feature) i.next(), id, errors));
                bWriter.newLine();
            }

            // close() flushes; done here so that a failure to write
            // the buffered data is reported below.
            bWriter.close();
        } catch (FileNotFoundException e) {
            GloDBUtils.printError("File \"" + filename + "\" can not be opened.");
        } catch (IOException e) {
            GloDBUtils.printError("Error writting output file \"" + filename + "\".");
        } finally {
            // no effect if already closed above
            closeQuietly(bWriter);
        }

        reportLabelErrors(id, errors);
    }

    /**
     * Format all Features into a GFF like string, one Feature per
     * line with each line terminated by '\n'.
     *
     * @param id ID of the Track to format
     * @return the formatted Features, or "" if no Track with the
     * given ID exists
     */
    public String format(String id) {
        Track track = ObjectHandles.getTrack(id);
        if (track == null) {
            GloDBUtils.printError("Not a valid track");
            return "";
        }

        // counters for potential errors in output
        LabelErrors errors = new LabelErrors();

        StringBuffer out = new StringBuffer();
        for (Iterator i = track.featureIterator(); i.hasNext();) {
            out.append(formatFeature((Feature) i.next(), id, errors)).append("\n");
        }

        reportLabelErrors(id, errors);
        return out.toString();
    }

    //--------------------------------------------------------------------------
    // Private helpers

    /**
     * Returns the Sequence with the given ID.  If no such Sequence
     * exists then an empty Sequence with that ID is created and a
     * warning is printed.
     */
    private Sequence resolveSequence(String seqName) {
        Sequence seq = ObjectHandles.getSequence(seqName);
        if (seq == null) {
            seq = new Sequence(seqName);
            GloDBUtils.printWarning("Sequence not found, so created empty Sequence with ID: "
                                    + seqName);
        }
        return seq;
    }

    /**
     * Builds the GFF line (without line terminator) for a Feature.
     * The "source", "feature", "score", "strand" and "frame"
     * attributes fill their respective columns; the "attributes"
     * attribute and any remaining attributes go in the final column.
     *
     * @param feature the Feature to format
     * @param id the Track ID, used as the feature label when the
     * Feature has no "feature" attribute
     * @param errors incremented for each substituted feature/strand
     * label
     */
    private String formatFeature(Feature feature, String id, LabelErrors errors) {
        // work on a copy of the attributes so we can remove entries
        // as we process them without altering the Feature, whether
        // or not getAttributesMap() already returns a fresh map.
        HashMap attribs = new HashMap(feature.getAttributesMap());

        StringBuffer line = new StringBuffer();

        // add sequence ID
        line.append(feature.getSourceID());

        // add source info
        line.append('\t');
        if (attribs.containsKey("source")) {
            line.append(attribs.remove("source"));
        } else {
            // if no source attribute then we are the source
            line.append("GloDB");
        }

        // add feature label
        line.append('\t');
        if (attribs.containsKey("feature")) {
            line.append(attribs.remove("feature"));
        } else {
            // if no feature attribute then use the track ID.
            // XXX this is probably not correct
            line.append(id);
            errors.featureLabels++;
        }

        // add start/stop info
        line.append('\t').append(feature.getStart());
        line.append('\t').append(feature.getStop());

        // add score info
        line.append('\t');
        if (attribs.containsKey("score")) {
            line.append(attribs.remove("score"));
        } else {
            // if no score attribute then use '.'
            line.append('.');
        }

        // add strand info
        line.append('\t');
        if (attribs.containsKey("strand")) {
            line.append(attribs.remove("strand"));
        } else {
            // if no strand attribute then use '+'
            // XXX this is probably not correct
            line.append('+');
            errors.strandLabels++;
        }

        // add frame info
        line.append('\t');
        if (attribs.containsKey("frame")) {
            line.append(attribs.remove("frame"));
        } else {
            // if no frame attribute then use '.'
            line.append('.');
        }

        // add attributes info
        boolean inAttributeColumn = false;
        if (attribs.containsKey("attributes")) {
            line.append('\t').append(attribs.remove("attributes"));
            inAttributeColumn = true;
        }

        // add remaining attributes.  The first item written to the
        // attribute column must be preceded by a tab, otherwise it
        // would run into the frame column.
        for (Iterator k = attribs.keySet().iterator(); k.hasNext();) {
            String key = (String) k.next();
            line.append(inAttributeColumn ? "; " : "\t");
            line.append(key).append(' ').append(attribs.get(key));
            inAttributeColumn = true;
        }

        return line.toString();
    }

    /**
     * Prints an error for each kind of label that had to be
     * substituted while formatting the Track with the given ID.
     */
    private void reportLabelErrors(String id, LabelErrors errors) {
        if (errors.featureLabels > 0) {
            String msg = "Number of feature labels not found: " + errors.featureLabels + "\n";
            msg += "   Used \"" + id + "\" instead.";
            GloDBUtils.printError(msg);
        }

        if (errors.strandLabels > 0) {
            String msg = "Strand attribute not found: " + errors.strandLabels + "\n";
            msg += "   Used \"+\" instead.";
            GloDBUtils.printError(msg);
        }
    }

    /**
     * Closes the stream, ignoring a null argument and any exception
     * thrown by close().  Only used for cleanup after the outcome of
     * the read/write has already been determined.
     */
    private static void closeQuietly(Closeable stream) {
        if (stream == null) return;
        try {
            stream.close();
        } catch (IOException ignored) {
            // nothing useful can be done if cleanup fails
        }
    }

    /** Counts of labels that were missing when formatting Features. */
    private static class LabelErrors {
        /** Features with no "feature" attribute. */
        int featureLabels = 0;
        /** Features with no "strand" attribute. */
        int strandLabels = 0;
    }

    /**
     * GFF specific FileFilter.  Accepts directories and files whose
     * name ends with one of the extensions in EXT.
     */
    private class GFFFilter extends FileFilter {
        public boolean accept(File f) {
            // accept directories
            if (f.isDirectory()) return true;

            // if true, then don't filter by file extensions.
            if (GUIUtils.showAllFiles()) return true;

            // accept files ending in one of the known extensions
            String name = f.getName();
            for (int i = 0; i < EXT.length; i++) {
                if (name.endsWith(EXT[i])) return true;
            }

            return false;
        }

        // set the filter's description
        public String getDescription() { return DESC; }
    }

} // GFFTrack.java