diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java deleted file mode 100755 index c30da828e..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.completegenomics; - -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * a codec for the VAR file types produced by the Complete Genomics Institute - */ -public class CGVarCodec implements FeatureCodec { - - private static final String REF_TYPE = "ref"; - private static final String SNP_TYPE = "snp"; - private static final String DELETION_TYPE = "del"; - private static final String INSERTION_TYPE = "ins"; - private static final String SUBSTITUTION_TYPE = "sub"; - - // the minimum number of features in the CG file line - private static final int minimumFeatureCount = 8; - - /** - * decode the location only - * @param line the input line to decode - * @return a HapMapFeature - */ - public Feature decodeLoc(String line) { - return decode(line); - } - - /** - * decode the CG record - * @param line the input line to decode - * @return a VariantContext - */ - public Feature decode(String line) { - String[] array = line.split("\\s+"); - - // make sure the split was successful - that we got an appropriate number of fields - if ( array.length < minimumFeatureCount ) - return null; - - String type = array[6]; - - long start = Long.valueOf(array[4]); - long end; - Allele ref, alt = null; - - //System.out.println(line); - - if ( type.equals(SNP_TYPE) ) { - ref = Allele.create(array[7], true); - alt = Allele.create(array[8], false); - end = start; - } else if ( type.equals(INSERTION_TYPE) ) { - ref = Allele.create(Allele.NULL_ALLELE_STRING, true); - alt = Allele.create(array[7], false); - end = start; - } else if ( type.equals(DELETION_TYPE) ) { - ref = Allele.create(array[7], true); - alt = Allele.create(Allele.NULL_ALLELE_STRING, false); - end = start + ref.length(); - //} else if ( type.equals(REF_TYPE) ) { - // ref = Allele.create("N", true); // ref bases aren't accurate - // start++; - // end = start; - //} else if ( type.equals(SUBSTITUTION_TYPE) ) { - // ref = Allele.create(array[7], true); - // alt = Allele.create(array[8], false); - // end = start + Math.max(ref.length(), alt.length()); - } else { - return null; // we don't handle other types - } - - HashSet alleles = new HashSet(); - alleles.add(ref); - if ( alt != null ) - alleles.add(alt); - - HashMap attrs = new HashMap(); - String id = array[array.length - 1]; - if ( id.indexOf("dbsnp") != -1 ) { - attrs.put(VariantContext.ID_KEY, parseID(id)); - } - - // create a new feature given the array - return new VariantContext("CGI", array[3], start, end, alleles, VariantContext.NO_NEG_LOG_10PERROR, null, attrs); - } - - public Class getFeatureType() { - return VariantContext.class; - } - - // There's no spec and no character to distinguish header lines... - private final static int NUM_HEADER_LINES = 12; - public Object readHeader(LineReader reader) { - return null; - - //String headerLine = null; - //try { - // for (int i = 0; i < NUM_HEADER_LINES; i++) - // headerLine = reader.readLine(); - //} catch (IOException e) { - // throw new IllegalArgumentException("Unable to read a line from the line reader"); - //} - //return headerLine; - } - - private static final Pattern DBSNP_PATTERN = Pattern.compile("^dbsnp\\.\\d+:(.*)"); - private String parseID(String raw) { - StringBuilder sb = null; - - String[] ids = raw.split(";"); - for ( String id : ids ) { - Matcher matcher = DBSNP_PATTERN.matcher(id); - if ( matcher.matches() ) { - String rsID = matcher.group(1); - if ( sb == null ) { - sb = new StringBuilder(rsID); - } else { - sb.append(";"); - sb.append(rsID); - } - } - } - - return sb == null ? null : sb.toString(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java deleted file mode 100755 index 284c43e90..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java +++ /dev/null @@ -1,209 +0,0 @@ -package org.broadinstitute.sting.utils.codecs.soapsnp; - -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.NameAwareCodec; -import org.broad.tribble.TribbleException; -import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; - -/** - * @author depristo - *

- * a codec for parsing soapsnp files (see http://soap.genomics.org.cn/soapsnp.html#usage2) - *

- * - * A simple text file format with the following whitespace separated fields: - * -1) Chromosome ID -2) Coordinate on chromosome, start from 1 -3) Reference genotype -4) Consensus genotype -5) Quality score of consensus genotype -6) Best base -7) Average quality score of best base -8) Count of uniquely mapped best base -9) Count of all mapped best base -10) Second best bases -11) Average quality score of second best base -12) Count of uniquely mapped second best base -13) Count of all mapped second best base -14) Sequencing depth of the site -15) Rank sum test p_value -16) Average copy number of nearby region -17) Whether the site is a dbSNP. - */ -public class SoapSNPCodec implements FeatureCodec, NameAwareCodec { - private String[] parts; - - // we store a name to give to each of the variant contexts we emit - private String name = "Unknown"; - - public Feature decodeLoc(String line) { - return decode(line); - } - - /** - * Decode a line as a Feature. - * - * @param line - * - * @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is - * a comment) - */ - public Feature decode(String line) { - try { - // parse into lines - parts = line.trim().split("\\s+"); - - // check that we got the correct number of tokens in the split - if (parts.length != 18) - throw new CodecLineParsingException("Invalid SoapSNP row found -- incorrect element count. Expected 18, got " + parts.length + " line = " + line); - - String contig = parts[0]; - long start = Long.valueOf(parts[1]); - AlleleAndGenotype allelesAndGenotype = parseAlleles(parts[2], parts[3], line); - - double negLog10PError = Integer.valueOf(parts[4]) / 10.0; - - Map attributes = new HashMap(); - attributes.put("BestBaseQ", parts[6]); - attributes.put("SecondBestBaseQ", parts[10]); - attributes.put("RankSumP", parts[15]); - // add info to keys - - //System.out.printf("Alleles = " + allelesAndGenotype.alleles); - //System.out.printf("genotype = " + allelesAndGenotype.genotype); - - VariantContext vc = new VariantContext(name, contig, start, start, allelesAndGenotype.alleles, allelesAndGenotype.genotype, negLog10PError, VariantContext.PASSES_FILTERS, attributes); - - //System.out.printf("line = %s%n", line); - //System.out.printf("vc = %s%n", vc); - - return vc; - } catch (CodecLineParsingException e) { - throw new TribbleException("Unable to parse line " + line,e); - } catch (NumberFormatException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - throw new TribbleException("Unable to parse line " + line,e); - } - } - - private static class AlleleAndGenotype { - Collection alleles; - Collection genotype; - - public AlleleAndGenotype(Collection alleles, Genotype genotype) { - this.alleles = alleles; - this.genotype = new HashSet(); - this.genotype.add(genotype); - } - } - - private AlleleAndGenotype parseAlleles(String ref, String consensusGenotype, String line) { - /* A Adenine - C Cytosine - G Guanine - T (or U) Thymine (or Uracil) - R A or G - Y C or T - S G or C - W A or T - K G or T - M A or C - B C or G or T - D A or G or T - H A or C or T - V A or C or G - N any base - . or - gap - */ - if ( ref.equals(consensusGenotype) ) - throw new TribbleException.InternalCodecException("Ref base and consensus genotype are the same " + ref); - - Allele refAllele = Allele.create(ref, true); - List genotypeAlleles = null; - - char base = consensusGenotype.charAt(0); - - switch ( base ) { - case 'A': case 'C': case 'G': case 'T': - Allele a = Allele.create(consensusGenotype); - genotypeAlleles = Arrays.asList(a, a); - break; - case 'R': case 'Y': case 'S': case 'W': case 'K': case 'M': - genotypeAlleles = determineAlt(refAllele, ref.charAt(0), base); - break; - default: - throw new TribbleException("Unexpected consensus genotype " + consensusGenotype + " at line = " + line); - } - - - Collection alleles = new HashSet(genotypeAlleles); - alleles.add(refAllele); - Genotype genotype = new Genotype("unknown", genotypeAlleles); // todo -- probably should include genotype quality - - return new AlleleAndGenotype( alleles, genotype ); - } - - private static final Map IUPAC_SNPS = new HashMap(); - static { - IUPAC_SNPS.put('R', "AG"); - IUPAC_SNPS.put('Y', "CT"); - IUPAC_SNPS.put('S', "GC"); - IUPAC_SNPS.put('W', "AT"); - IUPAC_SNPS.put('K', "GT"); - IUPAC_SNPS.put('M', "AC"); - } - - private List determineAlt(Allele ref, char refbase, char alt) { - String alts = IUPAC_SNPS.get(alt); - if ( alts == null ) - throw new IllegalStateException("BUG: unexpected consensus genotype " + alt); - - Allele a1 = alts.charAt(0) == refbase ? ref : Allele.create((byte)alts.charAt(0)); - Allele a2 = alts.charAt(1) == refbase ? ref : Allele.create((byte)alts.charAt(1)); - - //if ( a1 != ref && a2 != ref ) - // throw new IllegalStateException("BUG: unexpected consensus genotype " + alt + " does not contain the reference base " + ref); - - return Arrays.asList(a1, a2); - } - - /** - * @return VariantContext - */ - public Class getFeatureType() { - return VariantContext.class; - } - - public Object readHeader(LineReader reader) { - - return null; // we don't have a meaningful header - } - - /** - * get the name of this codec - * @return our set name - */ - public String getName() { - return name; - } - - /** - * set the name of this codec - * @param name new name - */ - public void setName(String name) { - this.name = name; - } - - public static void main(String[] args) { - System.out.printf("Testing " + args[0]); - } -} \ No newline at end of file