Moved CG and SOAP codecs to private

This commit is contained in:
Mark DePristo 2011-08-18 21:20:26 -04:00
parent f7414e39bc
commit d94da0b1cf
2 changed files with 0 additions and 368 deletions

View File

@ -1,159 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.completegenomics;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.HashMap;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* a codec for the VAR file types produced by the Complete Genomics Institute
*/
public class CGVarCodec implements FeatureCodec {
private static final String REF_TYPE = "ref";
private static final String SNP_TYPE = "snp";
private static final String DELETION_TYPE = "del";
private static final String INSERTION_TYPE = "ins";
private static final String SUBSTITUTION_TYPE = "sub";
// the minimum number of features in the CG file line
private static final int minimumFeatureCount = 8;
/**
* decode the location only
* @param line the input line to decode
* @return a HapMapFeature
*/
public Feature decodeLoc(String line) {
return decode(line);
}
/**
* decode the CG record
* @param line the input line to decode
* @return a VariantContext
*/
public Feature decode(String line) {
String[] array = line.split("\\s+");
// make sure the split was successful - that we got an appropriate number of fields
if ( array.length < minimumFeatureCount )
return null;
String type = array[6];
long start = Long.valueOf(array[4]);
long end;
Allele ref, alt = null;
//System.out.println(line);
if ( type.equals(SNP_TYPE) ) {
ref = Allele.create(array[7], true);
alt = Allele.create(array[8], false);
end = start;
} else if ( type.equals(INSERTION_TYPE) ) {
ref = Allele.create(Allele.NULL_ALLELE_STRING, true);
alt = Allele.create(array[7], false);
end = start;
} else if ( type.equals(DELETION_TYPE) ) {
ref = Allele.create(array[7], true);
alt = Allele.create(Allele.NULL_ALLELE_STRING, false);
end = start + ref.length();
//} else if ( type.equals(REF_TYPE) ) {
// ref = Allele.create("N", true); // ref bases aren't accurate
// start++;
// end = start;
//} else if ( type.equals(SUBSTITUTION_TYPE) ) {
// ref = Allele.create(array[7], true);
// alt = Allele.create(array[8], false);
// end = start + Math.max(ref.length(), alt.length());
} else {
return null; // we don't handle other types
}
HashSet<Allele> alleles = new HashSet<Allele>();
alleles.add(ref);
if ( alt != null )
alleles.add(alt);
HashMap<String, Object> attrs = new HashMap<String, Object>();
String id = array[array.length - 1];
if ( id.indexOf("dbsnp") != -1 ) {
attrs.put(VariantContext.ID_KEY, parseID(id));
}
// create a new feature given the array
return new VariantContext("CGI", array[3], start, end, alleles, VariantContext.NO_NEG_LOG_10PERROR, null, attrs);
}
public Class<VariantContext> getFeatureType() {
return VariantContext.class;
}
// There's no spec and no character to distinguish header lines...
private final static int NUM_HEADER_LINES = 12;
public Object readHeader(LineReader reader) {
return null;
//String headerLine = null;
//try {
// for (int i = 0; i < NUM_HEADER_LINES; i++)
// headerLine = reader.readLine();
//} catch (IOException e) {
// throw new IllegalArgumentException("Unable to read a line from the line reader");
//}
//return headerLine;
}
private static final Pattern DBSNP_PATTERN = Pattern.compile("^dbsnp\\.\\d+:(.*)");
private String parseID(String raw) {
StringBuilder sb = null;
String[] ids = raw.split(";");
for ( String id : ids ) {
Matcher matcher = DBSNP_PATTERN.matcher(id);
if ( matcher.matches() ) {
String rsID = matcher.group(1);
if ( sb == null ) {
sb = new StringBuilder(rsID);
} else {
sb.append(";");
sb.append(rsID);
}
}
}
return sb == null ? null : sb.toString();
}
}

View File

@ -1,209 +0,0 @@
package org.broadinstitute.sting.utils.codecs.soapsnp;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.NameAwareCodec;
import org.broad.tribble.TribbleException;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* @author depristo
* <p/>
* a codec for parsing soapsnp files (see http://soap.genomics.org.cn/soapsnp.html#usage2)
* <p/>
*
* A simple text file format with the following whitespace separated fields:
*
1) Chromosome ID
2) Coordinate on chromosome, start from 1
3) Reference genotype
4) Consensus genotype
5) Quality score of consensus genotype
6) Best base
7) Average quality score of best base
8) Count of uniquely mapped best base
9) Count of all mapped best base
10) Second best bases
11) Average quality score of second best base
12) Count of uniquely mapped second best base
13) Count of all mapped second best base
14) Sequencing depth of the site
15) Rank sum test p_value
16) Average copy number of nearby region
17) Whether the site is a dbSNP.
*/
public class SoapSNPCodec implements FeatureCodec, NameAwareCodec {
private String[] parts;
// we store a name to give to each of the variant contexts we emit
private String name = "Unknown";
public Feature decodeLoc(String line) {
return decode(line);
}
/**
* Decode a line as a Feature.
*
* @param line
*
* @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
* a comment)
*/
public Feature decode(String line) {
try {
// parse into lines
parts = line.trim().split("\\s+");
// check that we got the correct number of tokens in the split
if (parts.length != 18)
throw new CodecLineParsingException("Invalid SoapSNP row found -- incorrect element count. Expected 18, got " + parts.length + " line = " + line);
String contig = parts[0];
long start = Long.valueOf(parts[1]);
AlleleAndGenotype allelesAndGenotype = parseAlleles(parts[2], parts[3], line);
double negLog10PError = Integer.valueOf(parts[4]) / 10.0;
Map<String, Object> attributes = new HashMap<String, Object>();
attributes.put("BestBaseQ", parts[6]);
attributes.put("SecondBestBaseQ", parts[10]);
attributes.put("RankSumP", parts[15]);
// add info to keys
//System.out.printf("Alleles = " + allelesAndGenotype.alleles);
//System.out.printf("genotype = " + allelesAndGenotype.genotype);
VariantContext vc = new VariantContext(name, contig, start, start, allelesAndGenotype.alleles, allelesAndGenotype.genotype, negLog10PError, VariantContext.PASSES_FILTERS, attributes);
//System.out.printf("line = %s%n", line);
//System.out.printf("vc = %s%n", vc);
return vc;
} catch (CodecLineParsingException e) {
throw new TribbleException("Unable to parse line " + line,e);
} catch (NumberFormatException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
throw new TribbleException("Unable to parse line " + line,e);
}
}
private static class AlleleAndGenotype {
Collection<Allele> alleles;
Collection<Genotype> genotype;
public AlleleAndGenotype(Collection<Allele> alleles, Genotype genotype) {
this.alleles = alleles;
this.genotype = new HashSet<Genotype>();
this.genotype.add(genotype);
}
}
private AlleleAndGenotype parseAlleles(String ref, String consensusGenotype, String line) {
/* A Adenine
C Cytosine
G Guanine
T (or U) Thymine (or Uracil)
R A or G
Y C or T
S G or C
W A or T
K G or T
M A or C
B C or G or T
D A or G or T
H A or C or T
V A or C or G
N any base
. or - gap
*/
if ( ref.equals(consensusGenotype) )
throw new TribbleException.InternalCodecException("Ref base and consensus genotype are the same " + ref);
Allele refAllele = Allele.create(ref, true);
List<Allele> genotypeAlleles = null;
char base = consensusGenotype.charAt(0);
switch ( base ) {
case 'A': case 'C': case 'G': case 'T':
Allele a = Allele.create(consensusGenotype);
genotypeAlleles = Arrays.asList(a, a);
break;
case 'R': case 'Y': case 'S': case 'W': case 'K': case 'M':
genotypeAlleles = determineAlt(refAllele, ref.charAt(0), base);
break;
default:
throw new TribbleException("Unexpected consensus genotype " + consensusGenotype + " at line = " + line);
}
Collection<Allele> alleles = new HashSet<Allele>(genotypeAlleles);
alleles.add(refAllele);
Genotype genotype = new Genotype("unknown", genotypeAlleles); // todo -- probably should include genotype quality
return new AlleleAndGenotype( alleles, genotype );
}
private static final Map<Character, String> IUPAC_SNPS = new HashMap<Character, String>();
static {
IUPAC_SNPS.put('R', "AG");
IUPAC_SNPS.put('Y', "CT");
IUPAC_SNPS.put('S', "GC");
IUPAC_SNPS.put('W', "AT");
IUPAC_SNPS.put('K', "GT");
IUPAC_SNPS.put('M', "AC");
}
private List<Allele> determineAlt(Allele ref, char refbase, char alt) {
String alts = IUPAC_SNPS.get(alt);
if ( alts == null )
throw new IllegalStateException("BUG: unexpected consensus genotype " + alt);
Allele a1 = alts.charAt(0) == refbase ? ref : Allele.create((byte)alts.charAt(0));
Allele a2 = alts.charAt(1) == refbase ? ref : Allele.create((byte)alts.charAt(1));
//if ( a1 != ref && a2 != ref )
// throw new IllegalStateException("BUG: unexpected consensus genotype " + alt + " does not contain the reference base " + ref);
return Arrays.asList(a1, a2);
}
/**
* @return VariantContext
*/
public Class<VariantContext> getFeatureType() {
return VariantContext.class;
}
public Object readHeader(LineReader reader) {
return null; // we don't have a meaningful header
}
/**
* get the name of this codec
* @return our set name
*/
public String getName() {
return name;
}
/**
* set the name of this codec
* @param name new name
*/
public void setName(String name) {
this.name = name;
}
public static void main(String[] args) {
System.out.printf("Testing " + args[0]);
}
}