001 /* 002 * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of 003 * Pennsylvania. 004 * 005 * This file is part of Glo-DB. 006 * 007 * Glo-DB is free software: you can redistribute it and/or modify it 008 * under the terms of the GNU General Public License as published by 009 * the Free Software Foundation, either version 3 of the License, or 010 * (at your option) any later version. 011 * 012 * Glo-DB is distributed in the hope that it will be useful, but 013 * WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * General Public License for more details. 016 * 017 * You should have received a copy of the GNU General Public License 018 * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>. 019 * 020 * @(#)Sequence.java 021 */ 022 023 package edu.upenn.gloDB; 024 025 import java.util.HashMap; 026 import java.util.Random; 027 028 /** 029 * Sequence. 030 * 031 * @author Stephen Fisher 032 * @version $Id: Sequence.java,v 1.30.2.20 2007/03/01 21:17:33 fisher Exp $ 033 */ 034 035 public class Sequence { 036 037 /** 038 * This is the number of characters to print out per line when 039 * formatting the output in getDataFormatted(). 040 * @XXX This should probably be a user adjustable parameter. 041 */ 042 public static int FORMAT_WIDTH = 60; 043 044 /** 045 * When true then sequence data will be stored in the Sequence 046 * object in a compressed format. When false, the data will be 047 * stored as a String object. 048 * @XXX This should probably be a user adjustable parameter. 049 */ 050 public static boolean USE_COMPRESSION = false; 051 052 /** 053 * This is a unique name for the sequence, that is used by the 054 * parser to identify the sequence. This can not be changed 055 * to preserve Feature references. 056 */ 057 private String id; 058 059 /** 060 * This is the object that will handle getting the data for this 061 * Sequence. The object referenced by dataLoader will use the 062 * values in 'loaderArgs' and return the data as a String. 063 */ 064 private SequenceLoader dataLoader = null; 065 066 /** 067 * This is a map of key:value pairs needed to load the data from 068 * the data source, as defined by 'dataLoader': URL, file, database, 069 * etc. 070 */ 071 private HashMap loaderArgs = new HashMap(); 072 073 /** 074 * This is a flag for whether data has been loaded. It's possible 075 * that data was 'loaded' from a source that returned an empty 076 * string. 077 */ 078 private boolean dataLoaded = false; 079 080 /** 081 * This is the starting position for this Sequence on the 082 * chromosome. If the Sequence is a chromosome, then offset will 083 * be 0. 084 */ 085 private int offset; 086 087 /** 088 * Metadata related to the sequence. ex: source, locus, accession 089 * no., version, GI, protein_id. Should any of these be hardcoded 090 * as fields? 091 */ 092 private HashMap attributes = new HashMap(); 093 094 /** 095 * The sequence raw data as an unformatted string. The data is 096 * not loaded by default, rather it is loaded when the user 097 * performs an operation that requires the data. Note that 098 * concatination operations should be done on StringBuffer objects 099 * with the results stored as a String, since Strings are 100 * immutable and thus converted to StringBuffers during the 101 * operations. This is particularly important when loading data 102 * from a file which might entail a lot of concatinations. 103 */ 104 private String data = ""; 105 private byte[] cData; 106 private int dataLength = 0; 107 108 /** Used to create random ids. */ 109 private static Random random = new Random(System.currentTimeMillis()); 110 111 /** 112 * Create a new Sequence object and add it to the set of Sequence 113 * objects. 114 */ 115 public Sequence() { 116 this(true, ""); 117 } 118 119 /** 120 * Create a new Sequence object with the specified id, and add it 121 * to the set of Sequence objects. 122 */ 123 public Sequence(String id) { 124 this(true, id); 125 } 126 127 /** 128 * Create a new Sequence object and add the newly created Sequence 129 * object to the set of sequence objects if addToPool is true. 130 * @XXX This should probably be 'protected' instead of 'public' 131 * because all Sequences should really be added to sequencePool. 132 */ 133 public Sequence(boolean addToPool) { 134 this(addToPool, ""); 135 } 136 137 /** 138 * Create a new Sequence object and add the newly created Sequence 139 * object to the set of sequence objects if addToPool is true. 140 * @XXX This should probably be 'protected' instead of 'public' 141 * because all Sequences should really be added to sequencePool. 142 */ 143 public Sequence(boolean addToPool, String id) { 144 // if no ID, then create a random ID for this Sequence 145 if (id == "") id = randomID("_S"); 146 this.id = id; 147 148 if (addToPool) { 149 try { 150 // add self to set of all Sequences 151 ObjectHandles.addSequence(this); 152 } catch (InvalidIDException e) { 153 String id_new = randomID("_S"); 154 String msg = "ID \"" + id 155 + "\" already exists, using ID \"" + id_new + "\" instead."; 156 GloDBUtils.printWarning(msg); 157 158 // add self to set of all Sequences, using new id 159 this.id = id_new; 160 ObjectHandles.addSequence(this); 161 } 162 } 163 } 164 165 //-------------------------------------------------------------------------- 166 // Setters and Getters 167 168 /** Returns Feature type (see GloDBUtils) */ 169 public int getType() { return GloDBUtils.SEQUENCE; } 170 171 /** 172 * Set the ID. If the new ID is the same as the current ID, then 173 * doesn't do anything. If the new ID already exists in the 174 * sequencePool, then throws an exception. 175 * @param id a String that is a unique identifier for the sequence. 176 */ 177 /* 178 public void setID(String id) throws InvalidIDException { 179 try { setID(id, true); } 180 catch (InvalidIDException e) { throw e; } 181 } 182 */ 183 184 /** 185 * Set the ID. If the new ID is the same as the current ID, then 186 * doesn't do anything. If the new ID already exists in the 187 * sequencePool, then throws an exception. If 'updatePool' is 188 * true, then the sequencePool is updated. 'updatePool' must be 189 * true if the Sequence is in the sequencePool, else the sequencePool 190 * will become out of sync. 191 * @param id a String that is a unique identifier for the sequence. 192 */ 193 /* 194 public void setID(String id, boolean updatePool) throws InvalidIDException { 195 // don't do anything if new and old values are the same 196 if (this.id == id) return; 197 198 if (updatePool) { 199 // renameSequence() will do the actual changing of the 200 // Sequence's id. 201 try { ObjectHandles.renameSequence(this, id); } 202 catch (InvalidIDException e) { throw e; } 203 } else { 204 // since not in sequencePool, just change ID 205 this.id = id; 206 } 207 } 208 */ 209 210 /** Get the id. */ 211 public String getID() { return id; } 212 213 /** Set the Sequence source parser. */ 214 public void setDataLoader(SequenceLoader dataLoader) { this.dataLoader = dataLoader; } 215 216 /** Returns the parser for the Sequence source. */ 217 public SequenceLoader getDataLoader() { return dataLoader; } 218 219 /** Set the sequence loaderArgs. */ 220 public void setLoaderArgs(HashMap loaderArgs) { this.loaderArgs = loaderArgs; } 221 222 /** Get the sequence loaderArgs. */ 223 public HashMap getLoaderArgs() { return loaderArgs; } 224 225 /** Add a sequence parserArg. */ 226 public void addLoaderArg(Object key, Object value) { loaderArgs.put(key, value); } 227 228 /** Get a sequence parserArg. */ 229 public Object getLoaderArg(Object key) { return loaderArgs.get(key); } 230 231 /** Returns true if data was loaded. */ 232 public boolean isDataLoaded() { return dataLoaded; } 233 234 /** Set the Sequence starting position on the chromosome. */ 235 public void setOffset(int offset) { this.offset = offset; } 236 237 /** Returns the Sequence starting position on the chromosome. */ 238 public int getOffset() { return offset; } 239 240 /** 241 * Set the Sequence data, expecting a single unformatted string. 242 * This will set the dataLoaded flag to 'true'. 243 */ 244 public void setData(String data) { 245 if (GloDBUtils.isEmpty(data)) { 246 // no data so remove stored data 247 this.dataLength = 0; 248 this.data = ""; 249 this.cData = null; 250 dataLoaded = false; 251 252 } else { 253 this.dataLength = data.length(); 254 if (USE_COMPRESSION) { 255 this.cData = GloDBUtils.compressString(data); 256 } else { 257 this.data = data; 258 } 259 dataLoaded = true; 260 } 261 } 262 263 /** Returns the Sequence data as a single unformatted string. */ 264 public String getData() { 265 // loadData() returns "" if data already loaded 266 String locData = loadData(); 267 268 if (GloDBUtils.isEmpty(locData)) { 269 // data already loaded 270 if (USE_COMPRESSION) { 271 // uncompress data 272 if (cData == null) return ""; 273 else return GloDBUtils.uncompressString(cData); 274 } else { 275 return this.data; 276 } 277 } else { 278 return locData; 279 } 280 } 281 282 /** Set the sequence attributes. */ 283 public void setAttributes(HashMap attributes) { 284 // make sure attributes is never set to null 285 if (attributes == null) attributes = new HashMap(); 286 this.attributes = attributes; 287 } 288 289 /** Get the sequence attributes. */ 290 public HashMap getAttributes() { return attributes; } 291 292 //-------------------------------------------------------------------------- 293 // Miscellaneous Methods 294 295 /** Add a sequence attribute. */ 296 public void addAttribute(Object key, Object value) { attributes.put(key, value); } 297 298 /** Remove an attribute. */ 299 public void delAttribute(Object key) { attributes.remove(key); } 300 301 /** Returns true if attribute 'key' exists. */ 302 public boolean containsAttribute(Object key) { return attributes.containsKey(key); } 303 304 /** Get a sequence attribute. */ 305 public Object getAttribute(Object key) { return attributes.get(key); } 306 307 /** 308 * This will load the data from 'dataLoader' if overwriting the 309 * current value of data. If dataLoader is null, then this won't 310 * do anything. 311 */ 312 public void reloadData() { 313 if (dataLoader != null) { 314 // attempt to get data from dataLoader 315 setData(dataLoader.getData(loaderArgs)); 316 } 317 } 318 319 /** 320 * This will load the data from 'dataLoader' if data is currently 321 * empty. If data is not empty, then this won't do anything. 322 * This method is called internally whenever data is used, so the 323 * user should never need to call this method. We return the 324 * uncompressed data, because in some instances, the method 325 * calling loadData() requires uncompressed data. If we use 326 * setData() and getData(), then the data will be compressed and 327 * then uncompressed. 328 */ 329 public String loadData() { 330 if ((! isDataLoaded()) && (dataLoader != null) && (dataLength == 0)) { 331 // data is empty so attempt to get data from dataLoader. 332 String data = dataLoader.getData(loaderArgs); 333 setData(data); 334 return data; 335 } 336 337 return ""; 338 } 339 340 /** 341 * Returns the length of the data string. If the dataLoader isn't 342 * set and thus no data is loaded, then will return -1. 343 */ 344 public int length() { 345 loadData(); // make sure data is loaded before using data 346 347 // if data still not loaded, then return -1 348 if (isDataLoaded()) return dataLength; 349 else return -1; 350 } 351 352 /** 353 * Returns the initial position of the Sequence on the chromosome. 354 * This will return the same value as getOffset(). 355 */ 356 public int getMin() { return offset; } 357 358 /** 359 * Returns the maximum position of the Sequence on the chromosome. 360 * If the dataLoader isn't set and thus no data is loaded, then 361 * will return -1. 362 */ 363 public int getMax() { 364 // if no data and dataLoader not set then flag this by 365 // returning a length of -1 366 if ((! isDataLoaded()) && (dataLoader == null)) return -1; 367 368 loadData(); // make sure data is loaded before using data 369 return offset + dataLength; 370 } 371 372 /** 373 * Returns 'true' if the position 'pos' is contained in this 374 * Sequence object. 375 */ 376 public boolean contains(int pos) { 377 if ((pos >= offset) && (pos <= getMax())) return true; 378 else return false; 379 } 380 381 /** 382 * Returns 'true' if 'feature' is contained in this Sequence 383 * object. This will return 'false' if the Feature's source ID 384 * doesn't match this Sequence's ID. 385 */ 386 public boolean contains(Feature feature) { 387 if (feature.getSourceID() != id) return false; 388 389 if ((feature.getMin() >= offset) && (feature.getMax() <= getMax())) { 390 return true; 391 } else { 392 return false; 393 } 394 } 395 396 /** 397 * Returns the sequence data between position '(min-1)' and 398 * position 'max'. Goes from ((min-1) to max) because java 399 * Strings go from (0 to (length-1)) and the actual position data 400 * assumes (1 to length) 401 * @param min the starting position 402 * @param max the ending position 403 */ 404 public String getDataBounded(int min, int max) { 405 // this will load the data if necessary 406 String data = getData(); 407 408 if (dataLength > 0) { 409 // if offset = 0, then not set so need to adjust for 410 // sequence starting at 1 and String starting at 0. If 411 // offset is set, then 412 if (offset == 0) { 413 min -= 1; 414 } else { 415 min = min - offset; 416 max = (max - offset) + 1; 417 } 418 // return data.substring(min-1, max); 419 return data.substring(min, max); 420 } else { 421 return ""; 422 } 423 } 424 425 /** 426 * Returns the bounded sequence data with "\n" inserted every 427 * FORMAT_WIDTH characters. 428 */ 429 public String getDataBoundedFormatted(int min, int max) { 430 StringBuffer out = new StringBuffer(""); 431 432 String tmp = getDataBounded(min, max); 433 int total = tmp.length(); 434 int i = FORMAT_WIDTH; 435 while (i < total) { 436 out.append(tmp.substring(i - FORMAT_WIDTH, i) + "\n"); 437 i += FORMAT_WIDTH; 438 } 439 if (i >= total) out.append(tmp.substring(i - FORMAT_WIDTH, total)); 440 441 return out.toString(); 442 } 443 444 /** 445 * Returns the sequence data with "\n" inserted every FORMAT_WIDTH 446 * characters. 447 */ 448 public String getDataFormatted() { 449 // this will load the data if necessary 450 String data = getData(); 451 452 StringBuffer out = new StringBuffer(""); 453 int i = FORMAT_WIDTH; 454 while (i < dataLength) { 455 out.append(data.substring(i - FORMAT_WIDTH, i) + "\n"); 456 i += FORMAT_WIDTH; 457 } 458 if (i >= dataLength) out.append(data.substring(i - FORMAT_WIDTH, dataLength)); 459 460 return out.toString(); 461 } 462 463 /* 464 * Uses 'base' to create a random ID string that doesn't already 465 * exist in the sequencePool. 466 */ 467 public static String randomID(String base) { 468 String id = base + Long.toString(Math.abs(random.nextLong())); 469 while (ObjectHandles.sequencePool.containsKey(id)) { 470 id = base + Long.toString(Math.abs(random.nextLong())); 471 } 472 return id; 473 } 474 475 /** 476 * Returns attributes information. The data isn't included here. 477 * To get the data use {@link #getData() getData()} or {@link 478 * #getDataFormatted() getDataFormatted()}. 479 */ 480 public String toString() { 481 String out = ""; 482 483 out += "ID: " + id + "\n"; 484 out += "Offset: " + offset + "\n"; 485 486 if ((attributes == null) || attributes.isEmpty()) { 487 out += "Attributes: none"; 488 } else { 489 out += "Attributes:\n " + attributes; // will convert itself to a string 490 } 491 492 if (dataLength > 0) { 493 out += "\nSequence length: " + dataLength; 494 } else if (dataLoader == null) { 495 out += "\nSequence length: dataLoader not set"; 496 } else { 497 out += "\nSequence length: data not yet loaded"; 498 } 499 500 return out; 501 } 502 503 } // Sequence.java