001    /*
002     * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of
003     * Pennsylvania.
004     *
005     * This file is part of Glo-DB.
006     * 
007     * Glo-DB is free software: you can redistribute it and/or modify it
008     * under the terms of the GNU General Public License as published by
009     * the Free Software Foundation, either version 3 of the License, or
010     * (at your option) any later version.
011     * 
012     * Glo-DB is distributed in the hope that it will be useful, but
013     * WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * General Public License for more details.
016     * 
017     * You should have received a copy of the GNU General Public License
018     * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>.
019     *
020     * @(#)FeatureUtils.java
021     */
022    
023    package edu.upenn.gloDB;
024    
025    import java.util.TreeSet;
026    import java.util.HashMap;
027    import java.util.Iterator;
028    
029    /**
030     * FeatureUtils.  Miscellaneous functions that act on Features and
031     * Sets of Features.
032     *
033     * @author  Stephen Fisher
034     * @version $Id: FeatureUtils.java,v 1.1.2.17 2007/03/01 21:17:32 fisher Exp $
035     */
036    
037    public class FeatureUtils {
038    
039        //--------------------------------------------------------------------------
040        // Miscellaneous Methods
041       
042             /**
043              * Returns a new Feature containing the region in 'feature' that
044              * is bounded by minB/maxB.  Returns null if 'feature' doesn't
045              * overlap with minB/maxB.
046              * @XXX Doesn't deal with multiple Sequences.
047              */
048             public static Feature contained(Feature feature, int minB, int maxB) {
049                      int minA = feature.getMin();
050                      int maxA = feature.getMax();
051    
052                      // check for overlap
053                      if ((minA > maxB) || (maxA < minB)) return null;
054    
055                      int minOut = minA;
056                      int maxOut = maxA;
057                      if (minA < minB) minOut = minB;
058                      if (maxA > maxB) maxOut = maxB;
059    
060                      return new ExactFeature(minOut, maxOut, feature.getSource());
061             }
062    
063             /**
064              * Returns '-1' if 'feature' exists after the integer 'pos',
065              * returns '0' if 'pos' is contained in 'feature', and '1' if
066              * 'pos' occurs after 'feature'.
067              * @XXX This assumes 'pos' is positive within this Feature's
068              * Sequence boundaries.
069              */
070             public static int contains(Feature feature, int pos) {
071                      if (feature.getMin() > pos) { 
072                                    return -1;
073                      } else if (feature.getMax() < pos) { 
074                                    return 1;
075                      } else {
076                                    return 0;
077                      }
078             }
079    
080             /**
081              * Returns 'true' if the second Feature ('featureB') is contained
082              * in the first Feature ('featureA').  If the Features are on
083              * different Sources, then returns 'false'.
084              */
085             public static boolean contains(Feature featureA, Feature featureB) { 
086                      if (featureA.getSource() != featureB.getSource()) return false;
087    
088                      if ((featureA.getMin() > featureB.getMin()) 
089                                    || (featureA.getMax() < featureB.getMax())) {
090                                    return false;
091                      } else {
092                                    return true;
093                      }
094             }
095    
096             /**
097              * Returns 'true' if the Feature and Track overlap.  If the
098              * Features are on different Sources, then returns 'false'.
099              *
100              * f1:         t-------u
101              * aa|------------------------------|bb
102              *   |012345678901234567890123456789|
103              * f2:  a-b   c-d e-f g---h i-j
104              *
105              * f2.max <  f1.min : continue
106              * f2.min <= f1.max : true else false
107              */
108             public static boolean overlaps(Feature featureA, Track track) { 
109                      // make sure the Features are on the same Sequence
110                      TreeSet featuresB = track.featuresBySource(featureA.getSource());
111                      if (featuresB == null) return false;
112    
113                      int minA = featureA.getMin();
114                      int maxA = featureA.getMax();
115    
116                      for (Iterator i = featuresB.iterator(); i.hasNext();) {
117                                    Feature featureB = (Feature) i.next();
118    
119                                    if (featureB.getMax() < minA) {
120                                             // no overlap
121                                             continue;
122                                    } else {
123                                             if (featureB.getMin() <= maxA) {
124                                                      return true;
125                                             } else {
126                                                      return false;
127                                             }
128                                    }
129                      }
130    
131                      // if we got this far then we didn't find an overlap
132                      return false;
133             }
134    
135             /**
136              * Returns 'true' if the Features have overlapping positions.  If
137              * the Features are on different Sources, then returns 'false'.
138              */
139             public static boolean overlaps(Feature featureA, Feature featureB) { 
140                      if (featureA.getSource() != featureB.getSource()) return false;
141    
142                      if ((featureA.getMin() <= featureB.getMax()) && 
143                                    (featureB.getMin() <= featureA.getMax())) {
144                                    // A.min <= B.max and B.min <= A.max
145                                    return true;
146                      } else {
147                                    // no overlap
148                                    return false;
149                      }
150             }
151    
152             /**
153              * Returns the overlapping region between the two Features.  If no
154              * overlap, then null is returned.
155              * @XXX should test for same source 
156              */
157             public static Feature overlap(Feature featureA, Feature featureB) { 
158                      // this will also make sure they are on the same Sequence
159                      if (! overlaps(featureA, featureB)) return null;
160    
161                      int aMin = featureA.getMin();
162                      int aMax = featureA.getMax();
163                      int bMin = featureB.getMin();
164                      int bMax = featureB.getMax();
165    
166                      // we know the Features overlap so we don't need to test for
167                      // that here
168                      if (aMin >= bMin) {
169                                    if (aMax <= bMax) {
170                                             return featureA;
171                                    } else {
172                                             return new ExactFeature(aMin, bMax, featureA.getSource());
173                                    }
174                      } else {  // bMin > aMin
175                                    if (bMax <= aMax) {
176                                             return featureB;
177                                    } else {
178                                             return new ExactFeature(bMin, aMax, featureA.getSource());
179                                    }
180                      }
181             }
182    
183             /**
184              * Compares two Features for order.  Returns '-1', '0', or '1' if
185              * the first Feature ('featureA') is less than, equal to, or
186              * greater than the second Feature ('featureB').  If the Features
187              * have different Source sequences, then they will be sorted by
188              * Source ID.
189              */
190             public static int compareFeatures(Feature featureA, Feature featureB) {
191                      //              int source = featureA.getSourceID().compareToIgnoreCase(featureB.getSourceID());
192                      int source = featureA.getSourceID().compareTo(featureB.getSourceID());
193                      
194                      if (source == 0) { // same source
195                                    // who ever has min is less.
196                                    if (featureA.getMin() < featureB.getMin()) {
197                                             return -1;
198                                    } else if (featureA.getMin() > featureB.getMin()) {
199                                             return 1;
200                                    }
201    
202                                    // min are equal.
203                                    if (featureA.getMax() < featureB.getMax()) {
204                                             return -1;
205                                    } else if (featureA.getMax() > featureB.getMax()) {
206                                             return 1;
207                                    }
208    
209                                    if (GloDBUtils.ignoreAttributes()) {
210                                             // don't use attributes to compare Features, so at
211                                             // this point the Features are the same
212                                             return 0;
213                                    } else {
214                                             // min and max are equal, so return the comparison of
215                                             // the hashCodes for each of the attributes.  If we
216                                             // don't then when 2 Features overlap only one will be
217                                             // included in the Track.
218                                             Integer hashA = new Integer((featureA.getAttributes()).hashCode());
219                                             Integer hashB = new Integer((featureB.getAttributes()).hashCode());
220                                             return hashA.compareTo(hashB);
221                                    }
222                      } else {
223                                    // different sources so sort by source
224                                    return source;
225                      }
226             }
227    
228             /**
229              * Returns 'true' if the set of Features does not contain gaps
230              * between Features.  This assumes that the Features all occur on
231              * the same sequence (ie they refer to the same Sequence object).
232              */
233             public static boolean isContiguous(TreeSet features) { 
234                      // return false if no Features
235                      if ((features == null) || (features.size() == 0)) return false;
236    
237                      Iterator i = features.iterator();
238                      Feature feature = (Feature) i.next();
239                      int end = feature.getMax();
240    
241                      while (i.hasNext()) {
242                                    feature = (Feature) i.next();
243    
244                                    // if find a gap then exit, returning false
245                                    if (feature.getMin() > (end + 1)) return false;
246    
247                                    // still overlapping, so increase the end position if
248                                    // necessary
249                                    if (feature.getMax() > end) end = feature.getMax();
250                      }
251                                    
252                      // if made it this far, then no gaps in features
253                      return true;
254             }
255    
256             /**
257              * This will merge all Features in the Track that are within
258              * maxSpace of each other.  New Features will be created to span
259              * the entire cluster.  Threshold sets the minimum number of
260              * Features necessary to be considered a cluster and thus included
261              * in the output set.  A new Track will be returned containing the
262              * clusters.
263              * @deprecated replaced with cluster.py
264              */
265             public static TreeSet cluster(Track track, int maxSpace, int threshold) { 
266                      TreeSet clusters = new TreeSet();
267    
268                      // if Track is empty then there will be no matches so just
269                      // return an empty set
270                      if (track.numFeatures() == 0) return clusters;
271    
272                      // make sure maxSpace is legal
273                      if (maxSpace < 0) {
274                                    GloDBUtils.printError("Illegal \"maxSpace\" argument in FeatureUtils.cluster.");
275                                    return clusters;
276                      }
277    
278                      // make sure maxSpace is legal
279                      if (threshold < 0) {
280                                    GloDBUtils.printError("Illegal \"threshold\" argument in FeatureUtils.cluster.");
281                                    return clusters;
282                      }
283    
284                      // Source Sets for Track
285                      HashMap sources = track.getSources();
286    
287                      // test Features based on source
288                      for (Iterator s = sources.keySet().iterator(); s.hasNext();) {
289                                    String source = (String) s.next();
290    
291                                    // get Features for this source
292                                    TreeSet features = (TreeSet) sources.get(source);
293    
294                                    Iterator i = features.iterator();
295                                    Feature fLast = (Feature) i.next();
296                                    // max here refers to the range for the beginning of the
297                                    // next Feature
298                                    int max = fLast.getMax() + maxSpace;
299    
300                                    TreeSet group = new TreeSet();
301                                    group.add(fLast);
302                                    // stores the minimum position for the group
303                                    int gMin = fLast.getMin();
304    
305                                    while (i.hasNext()) {
306                                             Feature fCurrent = (Feature) i.next();
307    
308                                             if (fCurrent.getMin() <= max) {
309                                                      // within cluster spacing
310                                                      group.add(fCurrent);
311    
312                                                      if (fCurrent.getMax() > fLast.getMax()) {
313                                                                    // new maximum
314                                                                    fLast = fCurrent;
315                                                                    max = fLast.getMax() + maxSpace;
316                                                      }
317                                             } else {
318                                                      if (group.size() >= threshold) {
319                                                                    // enough Features in cluster to warrent
320                                                                    // inclusion in output
321    
322                                                                    if (group.size() > 1) {
323                                                                             // more than one Feature in cluster to
324                                                                             // make new Feature to span the set of
325                                                                             // Features
326                                                                             clusters.add(new ExactFeature(gMin, fLast.getMax(), 
327                                                                                                                                                             ObjectHandles.getSequence(source)));
328                                                                    } else {
329                                                                             // only one Feature in group so just add
330                                                                             // that Feature
331                                                                             clusters.add(group.first());
332                                                                    }
333                                                      }
334    
335                                                      fLast = fCurrent;
336                                                      max = fLast.getMax() + maxSpace;
337    
338                                                      group.clear();
339                                                      group.add(fLast);
340                                                      gMin = fLast.getMin();
341                                             }
342                                    }
343    
344                                    // add last group, if present
345                                    if (group.size() >= threshold) {
346                                             // enough Features in cluster to warrent inclusion in
347                                             // output
348                                             if (group.size() > 1) {
349                                                      // more than one Feature in cluster so make new
350                                                      // Feature to span the set of Features
351                                                      clusters.add(new ExactFeature(gMin, fLast.getMax(), 
352                                                                                                                                      ObjectHandles.getSequence(source)));
353                                             } else {
354                                                      // only one Feature in group so just add that
355                                                      // Feature
356                                                      clusters.add(group.first());
357                                             }
358                                    }
359                                    
360                      }
361                      return clusters;
362             }
363    
364             /**
365              * This will merge all overlapping Features in the track,
366              * creating new Feature objects as necessary.  If no Features in
367              * 'track', then this returns an empty TreeSet.
368              */
369             public static TreeSet mergeContiguous(Track track) { 
370                      TreeSet newFeatures = new TreeSet();
371    
372                      // don't do anything if no Features
373                      if (track.numFeatures() == 0) return newFeatures;
374    
375                      // step through the Features one Sequence at a time
376                      for (Iterator s = track.getSourceSet().iterator(); s.hasNext();) {
377                                    Sequence source = (Sequence) ObjectHandles.sequencePool.get(s.next());
378                                    TreeSet features = track.featuresBySource(source);
379    
380                                    Iterator i = features.iterator();
381                                    Feature prevFeature = (Feature) i.next();
382                                    int max = prevFeature.getMax();
383                                    // merge the attributes for all Features that are merged
384                                    HashMap attribs = prevFeature.getAttributesMap();
385    
386                                    // loop through all Features on the current Sequence.
387                                    // Since they are in a TreeSet, they are already sorted by
388                                    // their minimum values.
389                                    while (i.hasNext()) {
390                                             Feature feature = (Feature) i.next();
391    
392                                             if ((max + 1) < feature.getMin()) { 
393                                                      // we found a gap, so the current Feature 'feature'
394                                                      // doesn't overlap the previous Feature(s).  So we
395                                                      // need to add the previous Feature(s) and then
396                                                      // start again using the current Feature.
397                                                      if (prevFeature.getMax() == max) {
398                                                                    // didn't find any overlapping Features, so
399                                                                    // just add the previous Feature
400                                                                    newFeatures.add(prevFeature);
401                                                      } else {
402                                                                    // need to create a new Feature that spans
403                                                                    // the current overlapping Features
404                                                                    Feature newFeature = new ExactFeature(prevFeature.getMin(), max, source);
405                                                                    newFeature.setAttributes(attribs);
406                                                                    newFeatures.add(newFeature);
407                                                      }
408    
409                                                      prevFeature = feature;
410                                                      max = feature.getMax();
411                                                      attribs = feature.getAttributesMap();
412                                             } else {
413                                                      // overlapping Features, so incremement 'max' if
414                                                      // necessary and add the current Feature
415                                                      // attributes
416                                                      if (feature.getMax() > max) max = feature.getMax();
417                                                      attribs.putAll(feature.getAttributesMap());
418                                             }
419                                    }
420    
421                                    // need to add the final Feature
422                                    if (prevFeature.getMax() == max) {
423                                             // didn't find any overlapping Features, so
424                                             // just add the previous Feature
425                                             newFeatures.add(prevFeature);
426                                    } else {
427                                             // need to create a new Feature that spans
428                                             // the current overlapping Features
429                                             Feature newFeature = new ExactFeature(prevFeature.getMin(), max, source);
430                                             newFeature.setAttributes(attribs);
431                                             newFeatures.add(newFeature);
432                                    }
433                      }
434    
435                      return newFeatures;
436             }
437    
438    } // FeatureUtils.java