001    /*
002     * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of
003     * Pennsylvania.
004     *
005     * This file is part of Glo-DB.
006     * 
007     * Glo-DB is free software: you can redistribute it and/or modify it
008     * under the terms of the GNU General Public License as published by
009     * the Free Software Foundation, either version 3 of the License, or
010     * (at your option) any later version.
011     * 
012     * Glo-DB is distributed in the hope that it will be useful, but
013     * WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * General Public License for more details.
016     * 
017     * You should have received a copy of the GNU General Public License
018     * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>.
019     *
020     * @(#)FASTAParserFly.java
021     */
022    
023    package edu.upenn.gloDB.io;
024    
025    import java.util.HashMap;
026    import java.util.HashSet;
027    
028    /**
029     * Parses the header line at the beginning of each sequence.  The
030     * header line is chopped into substrings at each space (" ").  If a
031     * substring contains a ":" then it is assumed to be a key:value pair
032     * and is added to the attributes field as such.  Otherwise, it is
033     * added as a value to the 'descriptors' key.  An example of a
034     * parseable header is:
035     * 
036     * <ul> <b>Unparsed:</b><br> >CG2945 gene symbol:cin FBgn0000316
037     * seq_release:3 gene_boundaries:(X:12,390..15,908[+]) cyto:1A1-1A1
038     * (GO:0006777 "Mo-molybdopterin cofactor biosynthesis") (GO:0001700
039     * "embryonic development (sensu Insecta)") </ul>
040     * 
041     * <ul> <b>Parsed:</b><br> {dbxref=[GO:0006777 "Mo-molybdopterin
042     * cofactor biosynthesis", GO:0001700 "embryonic development (sensu
043     * Insecta)"], strand=+, cyto=1A1-1A1, seq_release=3,
044     * gene_boundaries=X:12,390..15,908, descriptors=gene FBgn0000316,
045     * symbol=cin, ID=CG2945} </ul>
046     *
047     * @XXX can we assume that the header starts with a sequence ID?
048     *
049     * @author  Stephen Fisher
050     * @version $Id: FASTAParserFly.java,v 1.7.2.4 2007/03/01 21:17:33 fisher Exp $
051     */
052    
053    public class FASTAParserFly implements FASTAParser {
054    
055             public HashMap parseHeader(String header) {
056                      HashMap attributes = new HashMap();
057    
058                      // store descriptors as a string because some descriptors are
059                      // more than one word long.
060                      String descriptors = "";
061    
062                      // some sequences will have more than one dbxref.
063                      HashSet dbxref = new HashSet();
064    
065                      // chop off the ">" from the beginning of the header
066                      header = header.substring(1);
067    
068                      // split header at each space 
069                      String[] attribs = header.split(" ");
070                      
071                      // the header is assumed to start with a sequence ID
072                      attributes.put("ID", attribs[0]);
073    
074                      //              GloDBUtils.printMsg("Loading: " + attribs[0]);
075    
076                      String[] tmp;
077                      int i = 1;
078                      while (i < attribs.length) {
079                                    String value = attribs[i];
080    
081                                    // test if a dbxref which is contained in "()"
082                                    if (value.startsWith("(")) {
083                                             // remove initial parenthesis
084                                             value = value.substring(1);
085    
086                                             // continue reading dbxref - ends with ")"
087                                             i += 1;
088                                             while (i < attribs.length) {
089                                                      value += " " + attribs[i];
090                                                      if (attribs[i].endsWith(")")) { break; }
091                                                      i += 1;
092                                             }
093    
094                                             // remove ")"
095                                             value = value.substring(0, value.length()-1);
096    
097                                             // add to dbxref hashSet
098                                             dbxref.add(value);
099    
100                                    } else { // not a dbxref, so try to split at the first ":"
101                                             // split the substring at the first ":"
102                                             tmp = attribs[i].split(":", 2);
103                                    
104                                             if (tmp.length == 1) {
105                                                      // tmp only has one value (the substring doesn't
106                                                      // contain a ":"), so add it as a descriptor.
107                                                      if (descriptors.length() > 0) { descriptors += " "; }
108                                                      descriptors += tmp[0];
109                                                      i += 1;
110                                                      continue;
111    
112                                             } else if (tmp[0].equalsIgnoreCase("gene_boundaries")) {
113                                                      // XXX: need to create locations here. will need
114                                                      // to reference what sequence??  what is the
115                                                      // format for the position information?  can a
116                                                      // feature have more than one position pair?
117    
118                                                      // remove parenthesis surrounding positions.
119                                                      value = tmp[1].substring(1);
120                                                      value = value.substring(0, value.length()-1);
121    
122                                                      // if ends with "]", then contains strand information "+/-"
123                                                      if (value.endsWith("]")) {
124                                                                    String strand = value.substring(value.length()-2, value.length()-1);
125                                                                    value = value.substring(0, value.length()-3);
126    
127                                                                    // add strand info to hashMap
128                                                                    attributes.put("strand", strand);
129                                                      }
130    
131                                                      // get the Sequence ID and start/stop boundaries
132                                                      String pos[] = value.split(":", 2);
133                                                      attributes.put("source", pos[0]);
134                                                      // XXX: This assumes that these Locations do NOT
135                                                      // have more than one position pair.
136                                                      attributes.put("boundaries", pos[1]);
137    
138                                                      // add gene_boundaries info to hashMap
139                                                      //                                              attributes.put("gene_boundaries", value);
140                                             } else {
141                                                      attributes.put(tmp[0], tmp[1]);
142                                             }
143                                    }
144                                    
145                                    i += 1;
146                      }
147                      
148                      // only add descriptors if they actually exist.
149                      if (descriptors.length() > 0) {
150                                    attributes.put("descriptors", descriptors);
151                      }
152    
153                      // only add dbxrefs if they actually exist.
154                      if (dbxref.size() > 0) {
155                                    attributes.put("dbxref", dbxref);
156                      }
157    
158                      //               GloDBUtils.printMsg("Attributes raw: " + header);
159                      //               GloDBUtils.printMsg("Attributes parsed: " + attributes);
160                      
161                      return attributes;
162             }
163    
164    }  // FASTAParserFly.java
165