001 /* 002 * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of 003 * Pennsylvania. 004 * 005 * This file is part of Glo-DB. 006 * 007 * Glo-DB is free software: you can redistribute it and/or modify it 008 * under the terms of the GNU General Public License as published by 009 * the Free Software Foundation, either version 3 of the License, or 010 * (at your option) any later version. 011 * 012 * Glo-DB is distributed in the hope that it will be useful, but 013 * WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * General Public License for more details. 016 * 017 * You should have received a copy of the GNU General Public License 018 * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>. 019 * 020 * @(#)FeatureUtils.java 021 */ 022 023 package edu.upenn.gloDB; 024 025 import java.util.TreeSet; 026 import java.util.HashMap; 027 import java.util.Iterator; 028 029 /** 030 * FeatureUtils. Miscellaneous functions that act on Features and 031 * Sets of Features. 032 * 033 * @author Stephen Fisher 034 * @version $Id: FeatureUtils.java,v 1.1.2.17 2007/03/01 21:17:32 fisher Exp $ 035 */ 036 037 public class FeatureUtils { 038 039 //-------------------------------------------------------------------------- 040 // Miscellaneous Methods 041 042 /** 043 * Returns a new Feature containing the region in 'feature' that 044 * is bounded by minB/maxB. Returns null if 'feature' doesn't 045 * overlap with minB/maxB. 046 * @XXX Doesn't deal with multiple Sequences. 047 */ 048 public static Feature contained(Feature feature, int minB, int maxB) { 049 int minA = feature.getMin(); 050 int maxA = feature.getMax(); 051 052 // check for overlap 053 if ((minA > maxB) || (maxA < minB)) return null; 054 055 int minOut = minA; 056 int maxOut = maxA; 057 if (minA < minB) minOut = minB; 058 if (maxA > maxB) maxOut = maxB; 059 060 return new ExactFeature(minOut, maxOut, feature.getSource()); 061 } 062 063 /** 064 * Returns '-1' if 'feature' exists after the integer 'pos', 065 * returns '0' if 'pos' is contained in 'feature', and '1' if 066 * 'pos' occurs after 'feature'. 067 * @XXX This assumes 'pos' is positive within this Feature's 068 * Sequence boundaries. 069 */ 070 public static int contains(Feature feature, int pos) { 071 if (feature.getMin() > pos) { 072 return -1; 073 } else if (feature.getMax() < pos) { 074 return 1; 075 } else { 076 return 0; 077 } 078 } 079 080 /** 081 * Returns 'true' if the second Feature ('featureB') is contained 082 * in the first Feature ('featureA'). If the Features are on 083 * different Sources, then returns 'false'. 084 */ 085 public static boolean contains(Feature featureA, Feature featureB) { 086 if (featureA.getSource() != featureB.getSource()) return false; 087 088 if ((featureA.getMin() > featureB.getMin()) 089 || (featureA.getMax() < featureB.getMax())) { 090 return false; 091 } else { 092 return true; 093 } 094 } 095 096 /** 097 * Returns 'true' if the Feature and Track overlap. If the 098 * Features are on different Sources, then returns 'false'. 099 * 100 * f1: t-------u 101 * aa|------------------------------|bb 102 * |012345678901234567890123456789| 103 * f2: a-b c-d e-f g---h i-j 104 * 105 * f2.max < f1.min : continue 106 * f2.min <= f1.max : true else false 107 */ 108 public static boolean overlaps(Feature featureA, Track track) { 109 // make sure the Features are on the same Sequence 110 TreeSet featuresB = track.featuresBySource(featureA.getSource()); 111 if (featuresB == null) return false; 112 113 int minA = featureA.getMin(); 114 int maxA = featureA.getMax(); 115 116 for (Iterator i = featuresB.iterator(); i.hasNext();) { 117 Feature featureB = (Feature) i.next(); 118 119 if (featureB.getMax() < minA) { 120 // no overlap 121 continue; 122 } else { 123 if (featureB.getMin() <= maxA) { 124 return true; 125 } else { 126 return false; 127 } 128 } 129 } 130 131 // if we got this far then we didn't find an overlap 132 return false; 133 } 134 135 /** 136 * Returns 'true' if the Features have overlapping positions. If 137 * the Features are on different Sources, then returns 'false'. 138 */ 139 public static boolean overlaps(Feature featureA, Feature featureB) { 140 if (featureA.getSource() != featureB.getSource()) return false; 141 142 if ((featureA.getMin() <= featureB.getMax()) && 143 (featureB.getMin() <= featureA.getMax())) { 144 // A.min <= B.max and B.min <= A.max 145 return true; 146 } else { 147 // no overlap 148 return false; 149 } 150 } 151 152 /** 153 * Returns the overlapping region between the two Features. If no 154 * overlap, then null is returned. 155 * @XXX should test for same source 156 */ 157 public static Feature overlap(Feature featureA, Feature featureB) { 158 // this will also make sure they are on the same Sequence 159 if (! overlaps(featureA, featureB)) return null; 160 161 int aMin = featureA.getMin(); 162 int aMax = featureA.getMax(); 163 int bMin = featureB.getMin(); 164 int bMax = featureB.getMax(); 165 166 // we know the Features overlap so we don't need to test for 167 // that here 168 if (aMin >= bMin) { 169 if (aMax <= bMax) { 170 return featureA; 171 } else { 172 return new ExactFeature(aMin, bMax, featureA.getSource()); 173 } 174 } else { // bMin > aMin 175 if (bMax <= aMax) { 176 return featureB; 177 } else { 178 return new ExactFeature(bMin, aMax, featureA.getSource()); 179 } 180 } 181 } 182 183 /** 184 * Compares two Features for order. Returns '-1', '0', or '1' if 185 * the first Feature ('featureA') is less than, equal to, or 186 * greater than the second Feature ('featureB'). If the Features 187 * have different Source sequences, then they will be sorted by 188 * Source ID. 189 */ 190 public static int compareFeatures(Feature featureA, Feature featureB) { 191 // int source = featureA.getSourceID().compareToIgnoreCase(featureB.getSourceID()); 192 int source = featureA.getSourceID().compareTo(featureB.getSourceID()); 193 194 if (source == 0) { // same source 195 // who ever has min is less. 196 if (featureA.getMin() < featureB.getMin()) { 197 return -1; 198 } else if (featureA.getMin() > featureB.getMin()) { 199 return 1; 200 } 201 202 // min are equal. 203 if (featureA.getMax() < featureB.getMax()) { 204 return -1; 205 } else if (featureA.getMax() > featureB.getMax()) { 206 return 1; 207 } 208 209 if (GloDBUtils.ignoreAttributes()) { 210 // don't use attributes to compare Features, so at 211 // this point the Features are the same 212 return 0; 213 } else { 214 // min and max are equal, so return the comparison of 215 // the hashCodes for each of the attributes. If we 216 // don't then when 2 Features overlap only one will be 217 // included in the Track. 218 Integer hashA = new Integer((featureA.getAttributes()).hashCode()); 219 Integer hashB = new Integer((featureB.getAttributes()).hashCode()); 220 return hashA.compareTo(hashB); 221 } 222 } else { 223 // different sources so sort by source 224 return source; 225 } 226 } 227 228 /** 229 * Returns 'true' if the set of Features does not contain gaps 230 * between Features. This assumes that the Features all occur on 231 * the same sequence (ie they refer to the same Sequence object). 232 */ 233 public static boolean isContiguous(TreeSet features) { 234 // return false if no Features 235 if ((features == null) || (features.size() == 0)) return false; 236 237 Iterator i = features.iterator(); 238 Feature feature = (Feature) i.next(); 239 int end = feature.getMax(); 240 241 while (i.hasNext()) { 242 feature = (Feature) i.next(); 243 244 // if find a gap then exit, returning false 245 if (feature.getMin() > (end + 1)) return false; 246 247 // still overlapping, so increase the end position if 248 // necessary 249 if (feature.getMax() > end) end = feature.getMax(); 250 } 251 252 // if made it this far, then no gaps in features 253 return true; 254 } 255 256 /** 257 * This will merge all Features in the Track that are within 258 * maxSpace of each other. New Features will be created to span 259 * the entire cluster. Threshold sets the minimum number of 260 * Features necessary to be considered a cluster and thus included 261 * in the output set. A new Track will be returned containing the 262 * clusters. 263 * @deprecated replaced with cluster.py 264 */ 265 public static TreeSet cluster(Track track, int maxSpace, int threshold) { 266 TreeSet clusters = new TreeSet(); 267 268 // if Track is empty then there will be no matches so just 269 // return an empty set 270 if (track.numFeatures() == 0) return clusters; 271 272 // make sure maxSpace is legal 273 if (maxSpace < 0) { 274 GloDBUtils.printError("Illegal \"maxSpace\" argument in FeatureUtils.cluster."); 275 return clusters; 276 } 277 278 // make sure maxSpace is legal 279 if (threshold < 0) { 280 GloDBUtils.printError("Illegal \"threshold\" argument in FeatureUtils.cluster."); 281 return clusters; 282 } 283 284 // Source Sets for Track 285 HashMap sources = track.getSources(); 286 287 // test Features based on source 288 for (Iterator s = sources.keySet().iterator(); s.hasNext();) { 289 String source = (String) s.next(); 290 291 // get Features for this source 292 TreeSet features = (TreeSet) sources.get(source); 293 294 Iterator i = features.iterator(); 295 Feature fLast = (Feature) i.next(); 296 // max here refers to the range for the beginning of the 297 // next Feature 298 int max = fLast.getMax() + maxSpace; 299 300 TreeSet group = new TreeSet(); 301 group.add(fLast); 302 // stores the minimum position for the group 303 int gMin = fLast.getMin(); 304 305 while (i.hasNext()) { 306 Feature fCurrent = (Feature) i.next(); 307 308 if (fCurrent.getMin() <= max) { 309 // within cluster spacing 310 group.add(fCurrent); 311 312 if (fCurrent.getMax() > fLast.getMax()) { 313 // new maximum 314 fLast = fCurrent; 315 max = fLast.getMax() + maxSpace; 316 } 317 } else { 318 if (group.size() >= threshold) { 319 // enough Features in cluster to warrent 320 // inclusion in output 321 322 if (group.size() > 1) { 323 // more than one Feature in cluster to 324 // make new Feature to span the set of 325 // Features 326 clusters.add(new ExactFeature(gMin, fLast.getMax(), 327 ObjectHandles.getSequence(source))); 328 } else { 329 // only one Feature in group so just add 330 // that Feature 331 clusters.add(group.first()); 332 } 333 } 334 335 fLast = fCurrent; 336 max = fLast.getMax() + maxSpace; 337 338 group.clear(); 339 group.add(fLast); 340 gMin = fLast.getMin(); 341 } 342 } 343 344 // add last group, if present 345 if (group.size() >= threshold) { 346 // enough Features in cluster to warrent inclusion in 347 // output 348 if (group.size() > 1) { 349 // more than one Feature in cluster so make new 350 // Feature to span the set of Features 351 clusters.add(new ExactFeature(gMin, fLast.getMax(), 352 ObjectHandles.getSequence(source))); 353 } else { 354 // only one Feature in group so just add that 355 // Feature 356 clusters.add(group.first()); 357 } 358 } 359 360 } 361 return clusters; 362 } 363 364 /** 365 * This will merge all overlapping Features in the track, 366 * creating new Feature objects as necessary. If no Features in 367 * 'track', then this returns an empty TreeSet. 368 */ 369 public static TreeSet mergeContiguous(Track track) { 370 TreeSet newFeatures = new TreeSet(); 371 372 // don't do anything if no Features 373 if (track.numFeatures() == 0) return newFeatures; 374 375 // step through the Features one Sequence at a time 376 for (Iterator s = track.getSourceSet().iterator(); s.hasNext();) { 377 Sequence source = (Sequence) ObjectHandles.sequencePool.get(s.next()); 378 TreeSet features = track.featuresBySource(source); 379 380 Iterator i = features.iterator(); 381 Feature prevFeature = (Feature) i.next(); 382 int max = prevFeature.getMax(); 383 // merge the attributes for all Features that are merged 384 HashMap attribs = prevFeature.getAttributesMap(); 385 386 // loop through all Features on the current Sequence. 387 // Since they are in a TreeSet, they are already sorted by 388 // their minimum values. 389 while (i.hasNext()) { 390 Feature feature = (Feature) i.next(); 391 392 if ((max + 1) < feature.getMin()) { 393 // we found a gap, so the current Feature 'feature' 394 // doesn't overlap the previous Feature(s). So we 395 // need to add the previous Feature(s) and then 396 // start again using the current Feature. 397 if (prevFeature.getMax() == max) { 398 // didn't find any overlapping Features, so 399 // just add the previous Feature 400 newFeatures.add(prevFeature); 401 } else { 402 // need to create a new Feature that spans 403 // the current overlapping Features 404 Feature newFeature = new ExactFeature(prevFeature.getMin(), max, source); 405 newFeature.setAttributes(attribs); 406 newFeatures.add(newFeature); 407 } 408 409 prevFeature = feature; 410 max = feature.getMax(); 411 attribs = feature.getAttributesMap(); 412 } else { 413 // overlapping Features, so incremement 'max' if 414 // necessary and add the current Feature 415 // attributes 416 if (feature.getMax() > max) max = feature.getMax(); 417 attribs.putAll(feature.getAttributesMap()); 418 } 419 } 420 421 // need to add the final Feature 422 if (prevFeature.getMax() == max) { 423 // didn't find any overlapping Features, so 424 // just add the previous Feature 425 newFeatures.add(prevFeature); 426 } else { 427 // need to create a new Feature that spans 428 // the current overlapping Features 429 Feature newFeature = new ExactFeature(prevFeature.getMin(), max, source); 430 newFeature.setAttributes(attribs); 431 newFeatures.add(newFeature); 432 } 433 } 434 435 return newFeatures; 436 } 437 438 } // FeatureUtils.java