Moved CG and SOAP codecs to private
This commit is contained in:
parent
f7414e39bc
commit
d94da0b1cf
|
|
@ -1,159 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.completegenomics;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* a codec for the VAR file types produced by the Complete Genomics Institute
|
||||
*/
|
||||
public class CGVarCodec implements FeatureCodec {
|
||||
|
||||
private static final String REF_TYPE = "ref";
|
||||
private static final String SNP_TYPE = "snp";
|
||||
private static final String DELETION_TYPE = "del";
|
||||
private static final String INSERTION_TYPE = "ins";
|
||||
private static final String SUBSTITUTION_TYPE = "sub";
|
||||
|
||||
// the minimum number of features in the CG file line
|
||||
private static final int minimumFeatureCount = 8;
|
||||
|
||||
/**
|
||||
* decode the location only
|
||||
* @param line the input line to decode
|
||||
* @return a HapMapFeature
|
||||
*/
|
||||
public Feature decodeLoc(String line) {
|
||||
return decode(line);
|
||||
}
|
||||
|
||||
/**
|
||||
* decode the CG record
|
||||
* @param line the input line to decode
|
||||
* @return a VariantContext
|
||||
*/
|
||||
public Feature decode(String line) {
|
||||
String[] array = line.split("\\s+");
|
||||
|
||||
// make sure the split was successful - that we got an appropriate number of fields
|
||||
if ( array.length < minimumFeatureCount )
|
||||
return null;
|
||||
|
||||
String type = array[6];
|
||||
|
||||
long start = Long.valueOf(array[4]);
|
||||
long end;
|
||||
Allele ref, alt = null;
|
||||
|
||||
//System.out.println(line);
|
||||
|
||||
if ( type.equals(SNP_TYPE) ) {
|
||||
ref = Allele.create(array[7], true);
|
||||
alt = Allele.create(array[8], false);
|
||||
end = start;
|
||||
} else if ( type.equals(INSERTION_TYPE) ) {
|
||||
ref = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
alt = Allele.create(array[7], false);
|
||||
end = start;
|
||||
} else if ( type.equals(DELETION_TYPE) ) {
|
||||
ref = Allele.create(array[7], true);
|
||||
alt = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
end = start + ref.length();
|
||||
//} else if ( type.equals(REF_TYPE) ) {
|
||||
// ref = Allele.create("N", true); // ref bases aren't accurate
|
||||
// start++;
|
||||
// end = start;
|
||||
//} else if ( type.equals(SUBSTITUTION_TYPE) ) {
|
||||
// ref = Allele.create(array[7], true);
|
||||
// alt = Allele.create(array[8], false);
|
||||
// end = start + Math.max(ref.length(), alt.length());
|
||||
} else {
|
||||
return null; // we don't handle other types
|
||||
}
|
||||
|
||||
HashSet<Allele> alleles = new HashSet<Allele>();
|
||||
alleles.add(ref);
|
||||
if ( alt != null )
|
||||
alleles.add(alt);
|
||||
|
||||
HashMap<String, Object> attrs = new HashMap<String, Object>();
|
||||
String id = array[array.length - 1];
|
||||
if ( id.indexOf("dbsnp") != -1 ) {
|
||||
attrs.put(VariantContext.ID_KEY, parseID(id));
|
||||
}
|
||||
|
||||
// create a new feature given the array
|
||||
return new VariantContext("CGI", array[3], start, end, alleles, VariantContext.NO_NEG_LOG_10PERROR, null, attrs);
|
||||
}
|
||||
|
||||
public Class<VariantContext> getFeatureType() {
|
||||
return VariantContext.class;
|
||||
}
|
||||
|
||||
// There's no spec and no character to distinguish header lines...
|
||||
private final static int NUM_HEADER_LINES = 12;
|
||||
public Object readHeader(LineReader reader) {
|
||||
return null;
|
||||
|
||||
//String headerLine = null;
|
||||
//try {
|
||||
// for (int i = 0; i < NUM_HEADER_LINES; i++)
|
||||
// headerLine = reader.readLine();
|
||||
//} catch (IOException e) {
|
||||
// throw new IllegalArgumentException("Unable to read a line from the line reader");
|
||||
//}
|
||||
//return headerLine;
|
||||
}
|
||||
|
||||
private static final Pattern DBSNP_PATTERN = Pattern.compile("^dbsnp\\.\\d+:(.*)");
|
||||
private String parseID(String raw) {
|
||||
StringBuilder sb = null;
|
||||
|
||||
String[] ids = raw.split(";");
|
||||
for ( String id : ids ) {
|
||||
Matcher matcher = DBSNP_PATTERN.matcher(id);
|
||||
if ( matcher.matches() ) {
|
||||
String rsID = matcher.group(1);
|
||||
if ( sb == null ) {
|
||||
sb = new StringBuilder(rsID);
|
||||
} else {
|
||||
sb.append(";");
|
||||
sb.append(rsID);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sb == null ? null : sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,209 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.codecs.soapsnp;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.NameAwareCodec;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.exception.CodecLineParsingException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @author depristo
|
||||
* <p/>
|
||||
* a codec for parsing soapsnp files (see http://soap.genomics.org.cn/soapsnp.html#usage2)
|
||||
* <p/>
|
||||
*
|
||||
* A simple text file format with the following whitespace separated fields:
|
||||
*
|
||||
1) Chromosome ID
|
||||
2) Coordinate on chromosome, start from 1
|
||||
3) Reference genotype
|
||||
4) Consensus genotype
|
||||
5) Quality score of consensus genotype
|
||||
6) Best base
|
||||
7) Average quality score of best base
|
||||
8) Count of uniquely mapped best base
|
||||
9) Count of all mapped best base
|
||||
10) Second best bases
|
||||
11) Average quality score of second best base
|
||||
12) Count of uniquely mapped second best base
|
||||
13) Count of all mapped second best base
|
||||
14) Sequencing depth of the site
|
||||
15) Rank sum test p_value
|
||||
16) Average copy number of nearby region
|
||||
17) Whether the site is a dbSNP.
|
||||
*/
|
||||
public class SoapSNPCodec implements FeatureCodec, NameAwareCodec {
|
||||
private String[] parts;
|
||||
|
||||
// we store a name to give to each of the variant contexts we emit
|
||||
private String name = "Unknown";
|
||||
|
||||
public Feature decodeLoc(String line) {
|
||||
return decode(line);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode a line as a Feature.
|
||||
*
|
||||
* @param line
|
||||
*
|
||||
* @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
|
||||
* a comment)
|
||||
*/
|
||||
public Feature decode(String line) {
|
||||
try {
|
||||
// parse into lines
|
||||
parts = line.trim().split("\\s+");
|
||||
|
||||
// check that we got the correct number of tokens in the split
|
||||
if (parts.length != 18)
|
||||
throw new CodecLineParsingException("Invalid SoapSNP row found -- incorrect element count. Expected 18, got " + parts.length + " line = " + line);
|
||||
|
||||
String contig = parts[0];
|
||||
long start = Long.valueOf(parts[1]);
|
||||
AlleleAndGenotype allelesAndGenotype = parseAlleles(parts[2], parts[3], line);
|
||||
|
||||
double negLog10PError = Integer.valueOf(parts[4]) / 10.0;
|
||||
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
attributes.put("BestBaseQ", parts[6]);
|
||||
attributes.put("SecondBestBaseQ", parts[10]);
|
||||
attributes.put("RankSumP", parts[15]);
|
||||
// add info to keys
|
||||
|
||||
//System.out.printf("Alleles = " + allelesAndGenotype.alleles);
|
||||
//System.out.printf("genotype = " + allelesAndGenotype.genotype);
|
||||
|
||||
VariantContext vc = new VariantContext(name, contig, start, start, allelesAndGenotype.alleles, allelesAndGenotype.genotype, negLog10PError, VariantContext.PASSES_FILTERS, attributes);
|
||||
|
||||
//System.out.printf("line = %s%n", line);
|
||||
//System.out.printf("vc = %s%n", vc);
|
||||
|
||||
return vc;
|
||||
} catch (CodecLineParsingException e) {
|
||||
throw new TribbleException("Unable to parse line " + line,e);
|
||||
} catch (NumberFormatException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
throw new TribbleException("Unable to parse line " + line,e);
|
||||
}
|
||||
}
|
||||
|
||||
private static class AlleleAndGenotype {
|
||||
Collection<Allele> alleles;
|
||||
Collection<Genotype> genotype;
|
||||
|
||||
public AlleleAndGenotype(Collection<Allele> alleles, Genotype genotype) {
|
||||
this.alleles = alleles;
|
||||
this.genotype = new HashSet<Genotype>();
|
||||
this.genotype.add(genotype);
|
||||
}
|
||||
}
|
||||
|
||||
private AlleleAndGenotype parseAlleles(String ref, String consensusGenotype, String line) {
|
||||
/* A Adenine
|
||||
C Cytosine
|
||||
G Guanine
|
||||
T (or U) Thymine (or Uracil)
|
||||
R A or G
|
||||
Y C or T
|
||||
S G or C
|
||||
W A or T
|
||||
K G or T
|
||||
M A or C
|
||||
B C or G or T
|
||||
D A or G or T
|
||||
H A or C or T
|
||||
V A or C or G
|
||||
N any base
|
||||
. or - gap
|
||||
*/
|
||||
if ( ref.equals(consensusGenotype) )
|
||||
throw new TribbleException.InternalCodecException("Ref base and consensus genotype are the same " + ref);
|
||||
|
||||
Allele refAllele = Allele.create(ref, true);
|
||||
List<Allele> genotypeAlleles = null;
|
||||
|
||||
char base = consensusGenotype.charAt(0);
|
||||
|
||||
switch ( base ) {
|
||||
case 'A': case 'C': case 'G': case 'T':
|
||||
Allele a = Allele.create(consensusGenotype);
|
||||
genotypeAlleles = Arrays.asList(a, a);
|
||||
break;
|
||||
case 'R': case 'Y': case 'S': case 'W': case 'K': case 'M':
|
||||
genotypeAlleles = determineAlt(refAllele, ref.charAt(0), base);
|
||||
break;
|
||||
default:
|
||||
throw new TribbleException("Unexpected consensus genotype " + consensusGenotype + " at line = " + line);
|
||||
}
|
||||
|
||||
|
||||
Collection<Allele> alleles = new HashSet<Allele>(genotypeAlleles);
|
||||
alleles.add(refAllele);
|
||||
Genotype genotype = new Genotype("unknown", genotypeAlleles); // todo -- probably should include genotype quality
|
||||
|
||||
return new AlleleAndGenotype( alleles, genotype );
|
||||
}
|
||||
|
||||
private static final Map<Character, String> IUPAC_SNPS = new HashMap<Character, String>();
|
||||
static {
|
||||
IUPAC_SNPS.put('R', "AG");
|
||||
IUPAC_SNPS.put('Y', "CT");
|
||||
IUPAC_SNPS.put('S', "GC");
|
||||
IUPAC_SNPS.put('W', "AT");
|
||||
IUPAC_SNPS.put('K', "GT");
|
||||
IUPAC_SNPS.put('M', "AC");
|
||||
}
|
||||
|
||||
private List<Allele> determineAlt(Allele ref, char refbase, char alt) {
|
||||
String alts = IUPAC_SNPS.get(alt);
|
||||
if ( alts == null )
|
||||
throw new IllegalStateException("BUG: unexpected consensus genotype " + alt);
|
||||
|
||||
Allele a1 = alts.charAt(0) == refbase ? ref : Allele.create((byte)alts.charAt(0));
|
||||
Allele a2 = alts.charAt(1) == refbase ? ref : Allele.create((byte)alts.charAt(1));
|
||||
|
||||
//if ( a1 != ref && a2 != ref )
|
||||
// throw new IllegalStateException("BUG: unexpected consensus genotype " + alt + " does not contain the reference base " + ref);
|
||||
|
||||
return Arrays.asList(a1, a2);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return VariantContext
|
||||
*/
|
||||
public Class<VariantContext> getFeatureType() {
|
||||
return VariantContext.class;
|
||||
}
|
||||
|
||||
public Object readHeader(LineReader reader) {
|
||||
|
||||
return null; // we don't have a meaningful header
|
||||
}
|
||||
|
||||
/**
|
||||
* get the name of this codec
|
||||
* @return our set name
|
||||
*/
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the name of this codec
|
||||
* @param name new name
|
||||
*/
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
System.out.printf("Testing " + args[0]);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue