001    /*
002     * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of
003     * Pennsylvania.
004     *
005     * This file is part of Glo-DB.
006     * 
007     * Glo-DB is free software: you can redistribute it and/or modify it
008     * under the terms of the GNU General Public License as published by
009     * the Free Software Foundation, either version 3 of the License, or
010     * (at your option) any later version.
011     * 
012     * Glo-DB is distributed in the hope that it will be useful, but
013     * WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * General Public License for more details.
016     * 
017     * You should have received a copy of the GNU General Public License
018     * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>.
019     *
020     * @(#)Sequence.java
021     */
022    
023    package edu.upenn.gloDB;
024    
025    import java.util.HashMap;
026    import java.util.Random;
027    
028    /**
029     * Sequence.
030     *
031     * @author  Stephen Fisher
032     * @version $Id: Sequence.java,v 1.30.2.20 2007/03/01 21:17:33 fisher Exp $
033     */
034    
035    public class Sequence { 
036    
037             /** 
038              * This is the number of characters to print out per line when
039              * formatting the output in getDataFormatted().
040              * @XXX This should probably be a user adjustable parameter.
041              */
042             public static int FORMAT_WIDTH = 60;
043    
044             /** 
045              * When true then sequence data will be stored in the Sequence
046              * object in a compressed format.  When false, the data will be
047              * stored as a String object.
048              * @XXX This should probably be a user adjustable parameter.
049              */
050             public static boolean USE_COMPRESSION = false;
051    
052             /** 
053              * This is a unique name for the sequence, that is used by the
054              * parser to identify the sequence.  This can not be changed
055              * to preserve Feature references.
056              */
057             private String id;
058    
059             /** 
060              * This is the object that will handle getting the data for this
061              * Sequence.  The object referenced by dataLoader will use the
062              * values in 'loaderArgs' and return the data as a String.
063              */
064             private SequenceLoader dataLoader = null;
065    
066             /**
067              * This is a map of key:value pairs needed to load the data from
068              * the data source, as defined by 'dataLoader': URL, file, database,
069              * etc.
070              */
071             private HashMap loaderArgs = new HashMap();
072    
073             /** 
074              * This is a flag for whether data has been loaded.  It's possible
075              * that data was 'loaded' from a source that returned an empty
076              * string.
077              */
078             private boolean dataLoaded = false;
079    
080             /** 
081              * This is the starting position for this Sequence on the
082              * chromosome.  If the Sequence is a chromosome, then offset will
083              * be 0.
084              */
085             private int offset;
086    
087             /** 
088              * Metadata related to the sequence.  ex: source, locus, accession
089              * no., version, GI, protein_id.  Should any of these be hardcoded
090              * as fields?
091              */
092             private HashMap attributes = new HashMap();
093    
094             /** 
095              * The sequence raw data as an unformatted string.  The data is
096              * not loaded by default, rather it is loaded when the user
097              * performs an operation that requires the data.  Note that
098              * concatination operations should be done on StringBuffer objects
099              * with the results stored as a String, since Strings are
100              * immutable and thus converted to StringBuffers during the
101              * operations.  This is particularly important when loading data
102              * from a file which might entail a lot of concatinations.
103              */
104             private String data = "";
105             private byte[] cData;
106             private int dataLength = 0;
107    
108             /** Used to create random ids. */
109        private static Random random = new Random(System.currentTimeMillis());
110    
111             /** 
112              * Create a new Sequence object and add it to the set of Sequence
113              * objects.
114              */
115             public Sequence() { 
116                      this(true, "");
117             }
118    
119             /** 
120              * Create a new Sequence object with the specified id, and add it
121              * to the set of Sequence objects.
122              */
123             public Sequence(String id) { 
124                      this(true, id);
125             }
126    
127             /** 
128              * Create a new Sequence object and add the newly created Sequence
129              * object to the set of sequence objects if addToPool is true.
130              * @XXX This should probably be 'protected' instead of 'public'
131              * because all Sequences should really be added to sequencePool.
132              */
133             public Sequence(boolean addToPool) {
134                      this(addToPool, "");
135             }
136    
137             /** 
138              * Create a new Sequence object and add the newly created Sequence
139              * object to the set of sequence objects if addToPool is true.
140              * @XXX This should probably be 'protected' instead of 'public'
141              * because all Sequences should really be added to sequencePool.
142              */
143             public Sequence(boolean addToPool, String id) {
144                      // if no ID, then create a random ID for this Sequence
145                      if (id == "") id = randomID("_S");
146                      this.id = id;
147    
148                      if (addToPool) { 
149                                    try {
150                                             // add self to set of all Sequences
151                                             ObjectHandles.addSequence(this);
152                                    } catch (InvalidIDException e) {
153                                             String id_new = randomID("_S");
154                                             String msg = "ID \"" + id
155                                                      + "\" already exists, using ID \"" + id_new + "\" instead.";
156                                             GloDBUtils.printWarning(msg);
157                                             
158                                             // add self to set of all Sequences, using new id
159                                             this.id = id_new;
160                                             ObjectHandles.addSequence(this);
161                                    }
162                      }
163             }
164    
165        //--------------------------------------------------------------------------
166        // Setters and Getters
167       
168             /** Returns Feature type (see GloDBUtils) */
169             public int getType() { return GloDBUtils.SEQUENCE; }
170    
171        /** 
172              * Set the ID.  If the new ID is the same as the current ID, then
173              * doesn't do anything.  If the new ID already exists in the
174              * sequencePool, then throws an exception.
175              * @param id a String that is a unique identifier for the sequence.
176              */
177             /*
178             public void setID(String id) throws InvalidIDException { 
179                      try { setID(id, true); } 
180                      catch (InvalidIDException e) { throw e; }
181             }
182             */
183    
184        /** 
185              * Set the ID.  If the new ID is the same as the current ID, then
186              * doesn't do anything.  If the new ID already exists in the
187              * sequencePool, then throws an exception.  If 'updatePool' is
188              * true, then the sequencePool is updated.  'updatePool' must be
189              * true if the Sequence is in the sequencePool, else the sequencePool
190              * will become out of sync.
191              * @param id a String that is a unique identifier for the sequence.
192              */
193             /*
194             public void setID(String id, boolean updatePool) throws InvalidIDException { 
195                      // don't do anything if new and old values are the same
196                      if (this.id == id) return;
197    
198                      if (updatePool) {
199                                    // renameSequence() will do the actual changing of the
200                                    // Sequence's id.
201                                    try { ObjectHandles.renameSequence(this, id); }
202                                    catch (InvalidIDException e) { throw e; }
203                      } else {
204                                    // since not in sequencePool, just change ID
205                                    this.id = id;    
206                      }
207             }
208             */
209    
210        /** Get the id. */
211        public String getID() { return id; }
212    
213        /** Set the Sequence source parser. */
214             public void setDataLoader(SequenceLoader dataLoader) { this.dataLoader = dataLoader; }
215    
216        /** Returns the parser for the Sequence source. */
217        public SequenceLoader getDataLoader() { return dataLoader; }
218    
219        /** Set the sequence loaderArgs. */
220        public void setLoaderArgs(HashMap loaderArgs) { this.loaderArgs = loaderArgs; }
221    
222        /** Get the sequence loaderArgs. */
223        public HashMap getLoaderArgs() { return loaderArgs; }
224    
225        /** Add a sequence parserArg. */
226        public void addLoaderArg(Object key, Object value) { loaderArgs.put(key, value); }
227    
228        /** Get a sequence parserArg. */
229        public Object getLoaderArg(Object key) { return loaderArgs.get(key); }
230    
231        /** Returns true if data was loaded. */
232        public boolean isDataLoaded() { return dataLoaded; }
233    
234        /** Set the Sequence starting position on the chromosome. */
235        public void setOffset(int offset) { this.offset = offset; }
236    
237        /** Returns the Sequence starting position on the chromosome. */
238        public int getOffset() { return offset; }
239    
240        /** 
241              * Set the Sequence data, expecting a single unformatted string.
242              * This will set the dataLoaded flag to 'true'.
243              */
244        public void setData(String data) { 
245                      if (GloDBUtils.isEmpty(data)) {
246                                    // no data so remove stored data
247                                    this.dataLength = 0;
248                                    this.data = "";
249                                    this.cData = null;
250                                    dataLoaded = false;
251    
252                      } else {
253                                    this.dataLength = data.length();
254                                    if (USE_COMPRESSION) {
255                                             this.cData = GloDBUtils.compressString(data); 
256                                    } else {
257                                             this.data = data;
258                                    }
259                                    dataLoaded = true;
260                      }
261             }
262    
263        /** Returns the Sequence data as a single unformatted string. */
264        public String getData() { 
265                      // loadData() returns "" if data already loaded
266                      String locData = loadData();
267    
268                      if (GloDBUtils.isEmpty(locData)) {
269                                    // data already loaded
270                                    if (USE_COMPRESSION) {
271                                             // uncompress data
272                                             if (cData == null) return "";
273                                             else return GloDBUtils.uncompressString(cData);
274                                    } else {
275                                             return this.data;
276                                    }
277                      } else {
278                                    return locData;
279                      }
280             }
281    
282        /** Set the sequence attributes. */
283        public void setAttributes(HashMap attributes) { 
284                      // make sure attributes is never set to null
285                      if (attributes == null) attributes = new HashMap();
286                      this.attributes = attributes; 
287             }
288    
289        /** Get the sequence attributes. */
290        public HashMap getAttributes() { return attributes; }
291    
292        //--------------------------------------------------------------------------
293        // Miscellaneous Methods
294    
295        /** Add a sequence attribute. */
296        public void addAttribute(Object key, Object value) { attributes.put(key, value); }
297    
298        /** Remove an attribute. */
299        public void delAttribute(Object key) { attributes.remove(key); }
300    
301        /** Returns true if attribute 'key' exists. */
302        public boolean containsAttribute(Object key) { return attributes.containsKey(key); }
303    
304        /** Get a sequence attribute. */
305        public Object getAttribute(Object key) { return attributes.get(key); }
306    
307             /**
308              * This will load the data from 'dataLoader' if overwriting the
309              * current value of data.  If dataLoader is null, then this won't
310              * do anything.
311              */
312             public void reloadData() {
313                      if (dataLoader != null) {
314                                    // attempt to get data from dataLoader
315                                    setData(dataLoader.getData(loaderArgs));
316                      }
317             }
318    
319             /**
320              * This will load the data from 'dataLoader' if data is currently
321              * empty.  If data is not empty, then this won't do anything.
322              * This method is called internally whenever data is used, so the
323              * user should never need to call this method.  We return the
324              * uncompressed data, because in some instances, the method
325              * calling loadData() requires uncompressed data.  If we use
326              * setData() and getData(), then the data will be compressed and
327              * then uncompressed.
328              */
329             public String loadData() {
330                      if ((! isDataLoaded()) && (dataLoader != null) && (dataLength == 0)) {
331                                    // data is empty so attempt to get data from dataLoader.
332                                    String data = dataLoader.getData(loaderArgs);
333                                    setData(data);
334                                    return data;
335                      }
336    
337                      return "";
338             }
339    
340        /** 
341              * Returns the length of the data string.  If the dataLoader isn't
342              * set and thus no data is loaded, then will return -1.
343              */
344        public int length() { 
345                      loadData();  // make sure data is loaded before using data
346    
347                      // if data still not loaded, then return -1
348                      if (isDataLoaded()) return dataLength;
349                      else return -1; 
350             }
351    
352             /**
353              * Returns the initial position of the Sequence on the chromosome.
354              * This will return the same value as getOffset().
355              */
356             public int getMin() { return offset; }
357    
358             /**
359              * Returns the maximum position of the Sequence on the chromosome.
360              * If the dataLoader isn't set and thus no data is loaded, then
361              * will return -1.
362              */
363             public int getMax() { 
364                      // if no data and dataLoader not set then flag this by
365                      // returning a length of -1
366                      if ((! isDataLoaded()) && (dataLoader == null)) return -1;
367    
368                      loadData();  // make sure data is loaded before using data
369                      return offset + dataLength; 
370             }
371    
372             /**
373              * Returns 'true' if the position 'pos' is contained in this
374              * Sequence object.  
375              */
376             public boolean contains(int pos) { 
377                      if ((pos >= offset) && (pos <= getMax())) return true;
378                      else return false;
379             }
380    
381             /**
382              * Returns 'true' if 'feature' is contained in this Sequence
383              * object.  This will return 'false' if the Feature's source ID
384              * doesn't match this Sequence's ID.
385              */
386             public boolean contains(Feature feature) { 
387                      if (feature.getSourceID() != id) return false;
388    
389                      if ((feature.getMin() >= offset) && (feature.getMax() <= getMax())) {
390                                    return true;
391                      } else {
392                                    return false;
393                      }
394             }
395    
396             /** 
397              * Returns the sequence data between position '(min-1)' and
398              * position 'max'.  Goes from ((min-1) to max) because java
399              * Strings go from (0 to (length-1)) and the actual position data
400              * assumes (1 to length)
401              * @param min the starting position
402              * @param max the ending position
403              */
404             public String getDataBounded(int min, int max) { 
405                      // this will load the data if necessary
406                      String data = getData();  
407    
408                      if (dataLength > 0) {
409                                    // if offset = 0, then not set so need to adjust for
410                                    // sequence starting at 1 and String starting at 0.  If
411                                    // offset is set, then 
412                                    if (offset == 0) {
413                                             min -= 1;
414                                    } else {
415                                             min = min - offset;
416                                             max = (max - offset) + 1;
417                                    }
418                                    //                              return data.substring(min-1, max);
419                                    return data.substring(min, max);
420                      } else {
421                                    return "";
422                      }
423             }
424    
425             /** 
426              * Returns the bounded sequence data with "\n" inserted every
427              * FORMAT_WIDTH characters.
428              */
429             public String getDataBoundedFormatted(int min, int max) {
430                      StringBuffer out = new StringBuffer("");
431    
432                      String tmp = getDataBounded(min, max);
433                      int total = tmp.length();
434                      int i = FORMAT_WIDTH;
435                      while (i < total) {
436                                    out.append(tmp.substring(i - FORMAT_WIDTH, i) + "\n");
437                                    i += FORMAT_WIDTH;
438                      }
439                      if (i >= total) out.append(tmp.substring(i - FORMAT_WIDTH, total));
440    
441                      return out.toString();
442             }
443    
444             /** 
445              * Returns the sequence data with "\n" inserted every FORMAT_WIDTH
446              * characters.
447              */
448             public String getDataFormatted() {
449                      // this will load the data if necessary
450                      String data = getData();  
451    
452                      StringBuffer out = new StringBuffer("");
453                      int i = FORMAT_WIDTH;
454                      while (i < dataLength) {
455                                    out.append(data.substring(i - FORMAT_WIDTH, i) + "\n");
456                                    i += FORMAT_WIDTH;
457                      }
458                      if (i >= dataLength) out.append(data.substring(i - FORMAT_WIDTH, dataLength));
459    
460                      return out.toString();
461             }
462    
463             /*
464              * Uses 'base' to create a random ID string that doesn't already
465              * exist in the sequencePool.
466              */
467             public static String randomID(String base) {
468                      String id = base + Long.toString(Math.abs(random.nextLong()));
469                      while (ObjectHandles.sequencePool.containsKey(id)) {
470                                    id = base + Long.toString(Math.abs(random.nextLong()));
471                      }
472                      return id;
473             }
474    
475             /** 
476              * Returns attributes information.  The data isn't included here.
477              * To get the data use {@link #getData() getData()} or {@link
478              * #getDataFormatted() getDataFormatted()}.
479              */
480             public String toString() {
481                      String out = "";
482    
483                      out += "ID: " + id + "\n";
484                      out += "Offset: " + offset + "\n";
485    
486                      if ((attributes == null) || attributes.isEmpty()) {
487                                    out += "Attributes: none";
488                      } else {
489                                    out += "Attributes:\n  " + attributes;  // will convert itself to a string
490                      }
491    
492                      if (dataLength > 0) {
493                                    out += "\nSequence length: " + dataLength;
494                      } else if (dataLoader == null) {
495                                    out += "\nSequence length:  dataLoader not set";
496                      } else {
497                                    out += "\nSequence length:  data not yet loaded";
498                      }
499    
500                      return out;
501             }
502    
503    } // Sequence.java