001 /* 002 * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of 003 * Pennsylvania. 004 * 005 * This file is part of Glo-DB. 006 * 007 * Glo-DB is free software: you can redistribute it and/or modify it 008 * under the terms of the GNU General Public License as published by 009 * the Free Software Foundation, either version 3 of the License, or 010 * (at your option) any later version. 011 * 012 * Glo-DB is distributed in the hope that it will be useful, but 013 * WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * General Public License for more details. 016 * 017 * You should have received a copy of the GNU General Public License 018 * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>. 019 * 020 * @(#)FASTASequence.java 021 */ 022 023 package edu.upenn.gloDB.io; 024 025 import edu.upenn.gloDB.*; 026 import edu.upenn.gloDB.gui.GUIUtils; 027 import java.io.*; 028 import java.util.HashSet; 029 import java.util.HashMap; 030 import javax.swing.filechooser.FileFilter; 031 032 /** 033 * Import Sequence data from a FASTA file. The basic file format 034 * dictates a header line at the beginning of each sequence. There 035 * are no standards as to what the header line should contain or how 036 * it should be formatted, other than to stipulate that it begins with 037 * a ">". Thus this format sufficient for coding Sequence objects but 038 * not ideal for sequence annotations (Features). Since some sites, 039 * such as www.fruitfly.org, release annotations as FASTA files, some 040 * attempt has been made to parse the headers from specific sites. 041 * Users can use the FASTAParser interface to create their own header 042 * parsers as well. 043 * 044 * @XXX can we assume that the header starts with a Sequence ID? 045 * 046 * @author Stephen Fisher 047 * @version $Id: FASTASequence.java,v 1.31.2.16 2007/03/01 21:17:33 fisher Exp $ 048 */ 049 050 public class FASTASequence implements SequenceFile, SequenceLoader { 051 052 private final int ID = FileIO.FASTA; 053 private final String DESC = "FASTA files (*.fa; *.fas; *.fasta)"; 054 private final String[] EXT = {".fa", ".fas", ".fasta"}; 055 private final FileFilter fileFilter = new FASTAFilter(); 056 057 //-------------------------------------------------------------------------- 058 // Setters and Getters 059 060 public int getID() { return ID; } 061 062 public String getDesc() { return DESC; } 063 064 public String[] getExt() { return EXT; } 065 066 public FileFilter getFileFilter() { return fileFilter; } 067 068 //-------------------------------------------------------------------------- 069 // Miscellaneous Methods 070 071 /** 072 * Return the Sequence data as a String. This assumes a single 073 * Sequence per file. 074 * 075 * @XXX This should allow for FASTA files with more than one 076 * Sequence -- can use headers to find correct data. 077 */ 078 public String getData(HashMap loaderArgs) { 079 String filename = (String) loaderArgs.get("filename"); 080 if (GloDBUtils.isEmpty(filename)) { 081 GloDBUtils.printError("Empty file name, can't load source data."); 082 return ""; 083 } 084 085 try { 086 BufferedReader bReader = new BufferedReader(new FileReader(filename)); 087 088 String line; 089 090 // load sequence header 091 if ((line = bReader.readLine()) != null) { 092 // make sure header exists 093 if (line.startsWith(">")) { 094 GloDBUtils.printMsg("Loading: " + filename); 095 } else { 096 GloDBUtils.printError("File does not contain a header: " + filename); 097 bReader.close(); 098 return null; 099 } 100 } else { 101 GloDBUtils.printError("Empty file: " + filename); 102 bReader.close(); 103 return null; 104 } 105 106 107 int fileLength = (int) (new File(filename)).length(); 108 109 // Send output to the command line because the GUI hangs 110 // during the loading. 111 System.out.print("Working:..."); 112 int cnt = 0; 113 int interval = (fileLength) / 500; 114 if (interval == 0) interval = 1; 115 116 // load sequence data. We use a StringBuffer for the 117 // loading because this entails a lot of concatinations 118 // which are very slow to perform on String objects but 119 // very fast for StringBuffers. 120 StringBuffer sb = new StringBuffer(fileLength); 121 while ((line = bReader.readLine()) != null) { 122 // stop if reach another sequence 123 if (line.startsWith(">")) break; 124 sb.append(line); 125 if ((cnt++ % interval) == 0) System.out.print("."); 126 } 127 System.out.println(""); 128 129 bReader.close(); 130 return sb.toString(); 131 } catch (FileNotFoundException e) { 132 GloDBUtils.printError("File not found: " + e.getMessage()); 133 return null; 134 } catch (IOException e) { 135 GloDBUtils.printError("Error reading file: " + filename); 136 return null; 137 } 138 } 139 140 /** 141 * Load the first sequence in the FASTA file and return the 142 * resulting Sequence object. 143 */ 144 public Sequence load(String filename) { 145 return load(filename, "", new FASTAParserMinimal()); 146 } 147 148 /** 149 * Load the first sequence in the FASTA file and return the 150 * resulting Sequence object. 151 */ 152 public Sequence load(String filename, String id) { 153 return load(filename, id, new FASTAParserMinimal()); 154 } 155 156 /** 157 * Load the first sequence in the FASTA file and return the 158 * resulting Sequence object. 159 * 160 * @XXX need to throw FileIO exceptions, rather than just print 161 * errors. 162 */ 163 public Sequence load(String filename, String id, FASTAParser parser) { 164 File file = new File(filename); 165 166 if (id.length() == 0) { 167 // Need to create Sequence's ID. Remove ".fasta" or ".fas" 168 // or ".fa" filename extension, if present. Use File() to 169 // remove any path info from the filename so that the ID 170 // is just the name. 171 id = file.getName(); 172 if (id.endsWith(".fasta")) id = id.substring(0, id.length()-6); 173 else if (id.endsWith(".fas")) id = id.substring(0, id.length()-4); 174 else if (id.endsWith(".fa")) id = id.substring(0, id.length()-3); 175 } 176 177 Sequence sequence = new Sequence(id); 178 // String data = ""; // store sequence data as read from file. 179 180 try { 181 BufferedReader bReader = new BufferedReader(new FileReader(file)); 182 183 String line; 184 185 // load sequence header 186 if ((line = bReader.readLine()) != null) { 187 // make sure header exists 188 if (line.startsWith(">")) { 189 GloDBUtils.printMsg("Loading: " + filename); 190 sequence.setAttributes(parser.parseHeader(line)); 191 // setup parameters to load data later, if necessary 192 sequence.setDataLoader(this); 193 HashMap loaderArgs = new HashMap(); 194 loaderArgs.put("filename", filename); 195 sequence.setLoaderArgs(loaderArgs); 196 } else { 197 // since didn't correctly load file, remove the 198 // Sequence we just created 199 ObjectHandles.removeSequence(sequence); 200 201 GloDBUtils.printError("File doesn't contain a header: " + filename); 202 bReader.close(); 203 return null; 204 } 205 } else { 206 // since didn't correctly load file, remove the 207 // Sequence we just created 208 ObjectHandles.removeSequence(sequence); 209 210 GloDBUtils.printError("Empty file: " + filename); 211 bReader.close(); 212 return null; 213 } 214 215 /* 216 // load sequence data 217 while ((line = bReader.readLine()) != null) { 218 // stop if reach another sequence 219 if (line.startsWith(">")) break; 220 data += line; 221 } 222 sequence.setData(data); 223 */ 224 225 bReader.close(); 226 } catch (FileNotFoundException e) { 227 GloDBUtils.printError("File not found: " + e.getMessage()); 228 return null; 229 } catch (IOException e) { 230 GloDBUtils.printError("Error reading file: " + filename); 231 return null; 232 } 233 234 return sequence; 235 } 236 237 /** 238 * Load all Sequences in the FASTA file and return a Set 239 * containing the resulting Sequence objects. 240 * 241 * @XXX need to throw FileIO exceptions, rather than just print 242 * errors. 243 */ 244 public HashSet loadAll(String filename) { 245 return loadAll(filename, new FASTAParserMinimal()); 246 } 247 248 /** 249 * Load all Sequences in the FASTA file and return a set 250 * containing the resulting Sequence objects. 251 * 252 * @XXX need to throw FileIO exceptions, rather than just print 253 * errors. 254 */ 255 public HashSet loadAll(String filename, FASTAParser parser) { 256 HashSet sequences = new HashSet(); // set of loaded sequences 257 258 // make sure we have at least a minimal parser 259 if (parser == null) parser = new FASTAParserMinimal(); 260 261 try { 262 BufferedReader bReader = new BufferedReader(new FileReader(filename)); 263 264 String line; 265 boolean firstSequence = true; 266 267 Sequence seq = null; 268 // String data = ""; // store sequence data as read from file. 269 270 while ((line = bReader.readLine()) != null) { 271 // test for a sequence header 272 if (line.startsWith(">")) { 273 if (! firstSequence) { 274 // not first sequence so append existing sequence before 275 // reseting the variables for the next sequence. 276 // seq.setData(data); 277 sequences.add(seq); 278 } else { 279 firstSequence = false; 280 } 281 282 HashMap attributes = parser.parseHeader(line); 283 String id = (String) attributes.get("ID"); 284 if (id == null) id = ""; 285 seq = new Sequence(id); 286 seq.setAttributes(attributes); 287 // setup parameters to load data later, if necessary 288 seq.setDataLoader(this); 289 HashMap loaderArgs = new HashMap(); 290 loaderArgs.put("filename", filename); 291 seq.setLoaderArgs(loaderArgs); 292 293 /* 294 // THIS CODE REMOVED BECAUSE IT SETS ID AFTER CREATING SEQUENCE 295 // CURRENTLY NOT ALLOWING RENAMING OF SEQUENCES 296 // starting a new sequence 297 seq = new Sequence(); 298 seq.setAttributes(parser.parseHeader(line)); 299 // setup parameters to load data later, if necessary 300 seq.setDataLoader(this); 301 HashMap loaderArgs = new HashMap(); 302 loaderArgs.put("filename", filename); 303 seq.setLoaderArgs(loaderArgs); 304 try { 305 String id = (String) seq.getAttribute("ID"); 306 307 if (id != null) seq.setID(id); 308 else seq.setID(Sequence.randomID("_S")); 309 } catch (InvalidIDException e) { 310 String id = Sequence.randomID("_S"); 311 String msg = "WARNING: ID \"" + seq.getAttribute("ID") 312 + "\" already exists, using ID \"" + id + "\" instead."; 313 GloDBUtils.printMsg(msg); 314 seq.setID(id); 315 } 316 */ 317 318 // data = ""; // store sequence data as read from file. 319 } else { 320 // load sequence data 321 // data += line; 322 } 323 } 324 325 if (seq != null) { 326 // add last sequences info 327 // seq.setData(data); 328 sequences.add(seq); 329 } 330 331 bReader.close(); 332 } catch (FileNotFoundException e) { 333 GloDBUtils.printError("File not found: " + e.getMessage()); 334 return null; 335 } catch (IOException e) { 336 // XXX we should probably remove the sequences from the 337 // sequence pool here 338 GloDBUtils.printError("Error reading file: " + filename); 339 return null; 340 } 341 342 return sequences; 343 } 344 345 /** 346 * Save the Seqeuence to a file based on it's ID. This will 347 * overwrite any existing file. This will append ".fasta" to the 348 * filename. 349 */ 350 public void save(String id) { 351 // add ".fasta" filename extension, if necessary 352 String filename = id; 353 if ((! filename.endsWith(".fa")) && (! filename.endsWith(".fas")) 354 && (! filename.endsWith(".fasta"))) { 355 filename += ".fasta"; 356 } 357 358 save(id, filename, true); 359 } 360 361 /** 362 * Save the Sequence data. This will make sure the data is loaded 363 * prior to saving the Sequence. 364 */ 365 public void save(String id, String filename, boolean overwrite) { 366 GloDBUtils.printMsg("Saving 'FASTA' sequence files not yet supported."); 367 } 368 369 public String toString() { return "FASTA File Loader"; } 370 371 /** 372 * FASTA specific FileFilter. 373 * @XXX This should use EXT. 374 */ 375 private class FASTAFilter extends FileFilter { 376 public boolean accept(File f) { 377 // accept directories 378 if (f.isDirectory()) return true; 379 380 // if true, then don't filter by file extensions. 381 if (GUIUtils.showAllFiles()) return true; 382 383 // accept files ending in '.fasta' or '.fas' or '.fa' 384 if ((f.getName()).endsWith(".fasta")) return true; 385 if ((f.getName()).endsWith(".fas")) return true; 386 if ((f.getName()).endsWith(".fa")) return true; 387 388 return false; 389 } 390 391 // set the filter's description 392 public String getDescription() { return DESC; } 393 } 394 395 } // FASTASequence.java 396 397