001 /* 002 * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of 003 * Pennsylvania. 004 * 005 * This file is part of Glo-DB. 006 * 007 * Glo-DB is free software: you can redistribute it and/or modify it 008 * under the terms of the GNU General Public License as published by 009 * the Free Software Foundation, either version 3 of the License, or 010 * (at your option) any later version. 011 * 012 * Glo-DB is distributed in the hope that it will be useful, but 013 * WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * General Public License for more details. 016 * 017 * You should have received a copy of the GNU General Public License 018 * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>. 019 * 020 * @(#)FASTAParserFly.java 021 */ 022 023 package edu.upenn.gloDB.io; 024 025 import java.util.HashMap; 026 import java.util.HashSet; 027 028 /** 029 * Parses the header line at the beginning of each sequence. The 030 * header line is chopped into substrings at each space (" "). If a 031 * substring contains a ":" then it is assumed to be a key:value pair 032 * and is added to the attributes field as such. Otherwise, it is 033 * added as a value to the 'descriptors' key. An example of a 034 * parseable header is: 035 * 036 * <ul> <b>Unparsed:</b><br> >CG2945 gene symbol:cin FBgn0000316 037 * seq_release:3 gene_boundaries:(X:12,390..15,908[+]) cyto:1A1-1A1 038 * (GO:0006777 "Mo-molybdopterin cofactor biosynthesis") (GO:0001700 039 * "embryonic development (sensu Insecta)") </ul> 040 * 041 * <ul> <b>Parsed:</b><br> {dbxref=[GO:0006777 "Mo-molybdopterin 042 * cofactor biosynthesis", GO:0001700 "embryonic development (sensu 043 * Insecta)"], strand=+, cyto=1A1-1A1, seq_release=3, 044 * gene_boundaries=X:12,390..15,908, descriptors=gene FBgn0000316, 045 * symbol=cin, ID=CG2945} </ul> 046 * 047 * @XXX can we assume that the header starts with a sequence ID? 048 * 049 * @author Stephen Fisher 050 * @version $Id: FASTAParserFly.java,v 1.7.2.4 2007/03/01 21:17:33 fisher Exp $ 051 */ 052 053 public class FASTAParserFly implements FASTAParser { 054 055 public HashMap parseHeader(String header) { 056 HashMap attributes = new HashMap(); 057 058 // store descriptors as a string because some descriptors are 059 // more than one word long. 060 String descriptors = ""; 061 062 // some sequences will have more than one dbxref. 063 HashSet dbxref = new HashSet(); 064 065 // chop off the ">" from the beginning of the header 066 header = header.substring(1); 067 068 // split header at each space 069 String[] attribs = header.split(" "); 070 071 // the header is assumed to start with a sequence ID 072 attributes.put("ID", attribs[0]); 073 074 // GloDBUtils.printMsg("Loading: " + attribs[0]); 075 076 String[] tmp; 077 int i = 1; 078 while (i < attribs.length) { 079 String value = attribs[i]; 080 081 // test if a dbxref which is contained in "()" 082 if (value.startsWith("(")) { 083 // remove initial parenthesis 084 value = value.substring(1); 085 086 // continue reading dbxref - ends with ")" 087 i += 1; 088 while (i < attribs.length) { 089 value += " " + attribs[i]; 090 if (attribs[i].endsWith(")")) { break; } 091 i += 1; 092 } 093 094 // remove ")" 095 value = value.substring(0, value.length()-1); 096 097 // add to dbxref hashSet 098 dbxref.add(value); 099 100 } else { // not a dbxref, so try to split at the first ":" 101 // split the substring at the first ":" 102 tmp = attribs[i].split(":", 2); 103 104 if (tmp.length == 1) { 105 // tmp only has one value (the substring doesn't 106 // contain a ":"), so add it as a descriptor. 107 if (descriptors.length() > 0) { descriptors += " "; } 108 descriptors += tmp[0]; 109 i += 1; 110 continue; 111 112 } else if (tmp[0].equalsIgnoreCase("gene_boundaries")) { 113 // XXX: need to create locations here. will need 114 // to reference what sequence?? what is the 115 // format for the position information? can a 116 // feature have more than one position pair? 117 118 // remove parenthesis surrounding positions. 119 value = tmp[1].substring(1); 120 value = value.substring(0, value.length()-1); 121 122 // if ends with "]", then contains strand information "+/-" 123 if (value.endsWith("]")) { 124 String strand = value.substring(value.length()-2, value.length()-1); 125 value = value.substring(0, value.length()-3); 126 127 // add strand info to hashMap 128 attributes.put("strand", strand); 129 } 130 131 // get the Sequence ID and start/stop boundaries 132 String pos[] = value.split(":", 2); 133 attributes.put("source", pos[0]); 134 // XXX: This assumes that these Locations do NOT 135 // have more than one position pair. 136 attributes.put("boundaries", pos[1]); 137 138 // add gene_boundaries info to hashMap 139 // attributes.put("gene_boundaries", value); 140 } else { 141 attributes.put(tmp[0], tmp[1]); 142 } 143 } 144 145 i += 1; 146 } 147 148 // only add descriptors if they actually exist. 149 if (descriptors.length() > 0) { 150 attributes.put("descriptors", descriptors); 151 } 152 153 // only add dbxrefs if they actually exist. 154 if (dbxref.size() > 0) { 155 attributes.put("dbxref", dbxref); 156 } 157 158 // GloDBUtils.printMsg("Attributes raw: " + header); 159 // GloDBUtils.printMsg("Attributes parsed: " + attributes); 160 161 return attributes; 162 } 163 164 } // FASTAParserFly.java 165