/*
 * Copyright 2007, 2012 Stephen Fisher and Junhyong Kim, University of
 * Pennsylvania.
 *
 * This file is part of Glo-DB.
 *
 * Glo-DB is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Glo-DB is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Glo-DB. If not, see <http://www.gnu.org/licenses/>.
 *
 * @(#)FASTAParserMinimal.java
 */

package edu.upenn.gloDB.io;

import java.util.HashMap;

/**
 * Parses the header line at the beginning of each FASTA sequence.
 * The header line is chopped into substrings at each space (" "). If
 * a substring contains a ":" then it is assumed to be a key:value
 * pair and is added to the attributes field as such. Otherwise, it
 * is added as a value to the 'descriptors' key. This is very similar
 * to FASTAParserFly.java, however it doesn't process the
 * gene_boundaries in any special way. Thus this should not be used
 * for FASTA files that contain Features.
 *
 * Note: this assumes that the header starts with a sequence ID.
 *
 * @author  Stephen Fisher
 * @version $Id: FASTAParserMinimal.java,v 1.7.2.4 2007/03/01 21:17:33 fisher Exp $
 */

public class FASTAParserMinimal implements FASTAParser {

    /**
     * Splits a FASTA header line into an attribute map.
     *
     * The first space-delimited token is stored under the key "ID".
     * Every later token that contains a ":" is split at its first
     * colon and stored as key/value; because these go into the same
     * map, a token whose key is "ID" replaces the sequence ID. Tokens
     * without a colon are concatenated, each preceded by a single
     * space, and stored under the key "descriptors". That key is
     * present only when at least one token lacked a colon.
     *
     * @param header the header line, normally starting with ">"
     * @return a map from String keys to String values, always
     *         containing "ID"
     * @throws NullPointerException if header is null
     */
    public HashMap parseHeader(String header) {
        HashMap attributes = new HashMap();
        String descriptors = "";

        // Chop off the ">" only when it is really there. Doing it
        // unconditionally threw StringIndexOutOfBoundsException on
        // an empty header and ate the first character of the ID when
        // the caller had already removed the marker.
        if (header.startsWith(">")) {
            header = header.substring(1);
        }

        // split header at each space
        String[] attribs = header.split(" ");

        // The header is assumed to start with a sequence ID. split()
        // always returns at least one element here, so index 0 is
        // safe even for an empty header (the ID is then "").
        attributes.put("ID", attribs[0]);

        for (int i = 1; i < attribs.length; i++) {
            // Limit of 2: only the first colon separates key from
            // value, any further colons stay inside the value.
            String[] tmp = attribs[i].split(":", 2);

            if (tmp.length == 1) {
                // no ":" in the substring, so it is a descriptor
                descriptors += " " + tmp[0];
            } else {
                attributes.put(tmp[0], tmp[1]);
            }
        }

        // only add descriptors if they actually exist.
        if (descriptors.length() > 0) {
            attributes.put("descriptors", descriptors);
        }

        return attributes;
    }

} // FASTAParserMinimal.java