internal changes and some refactoring. slightly different final report. Now can take tracks that implement either Genotype or GenotypeList; takes an arg specifying what variants to look for (POINT - aka snp - or INDEL); takes an arg specifying whether default ref/ref call of one type (INDEL/POINT) should be implicitly assumed if another call (POINT/INDEL respectively) was made at the same position [this is probably most useful for indels and only (?) for sam pileups: if we have only point mutation call at a given position, it does mean that we do have coverage, and that there was no evidence whatsoever for an indel, so we have an implicit 'no-indel' call]

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@799 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-05-22 17:25:09 +00:00
parent d5bb4d9ba9
commit a9dfbfb309
1 changed files with 268 additions and 118 deletions

View File

@ -1,173 +1,225 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import java.util.Arrays;
import java.util.List;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.GenotypeList;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodSAMPileup;
import org.broadinstitute.sting.gatk.refdata.Genotype;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.playground.utils.GenotypingCallStats;
import org.broadinstitute.sting.playground.utils.TrioConcordanceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
@Requires(value=DataSource.REFERENCE,referenceMetaData={@RMD(name="mother",type=rodSAMPileup.class),
@RMD(name="father",type=rodSAMPileup.class),
@RMD(name="daughter",type=rodSAMPileup.class)})
//@Requires(value=DataSource.REFERENCE,referenceMetaData={@RMD(name="mother",type=rodSAMPileup.class),
// @RMD(name="father",type=rodSAMPileup.class),
// @RMD(name="daughter",type=rodSAMPileup.class)})
public class MendelianInheritanceWalker extends RefWalker<TrioConcordanceRecord, TrioConcordanceRecord> {
@Argument(fullName="consensus_cutoff", shortName="XC", doc="consensus cutoff", required=true ) public Double CONS_CUTOFF;
@Argument(fullName="snp_cutoff", shortName="XS", doc="snp cutoff", required=true ) public Double SNP_CUTOFF;
@Argument(fullName="indel_cutoff", shortName="XI", doc="indel cutoff", required=true ) public Double INDEL_CUTOFF;
@Argument(fullName="point_consensus_cutoff", shortName="XPC", doc="confidence cutoff for consensus in point genotype", required=true ) public Double POINT_CONS_CUTOFF;
@Argument(fullName="point_variant_cutoff", shortName="XPV", doc="confidence cutoff for variant (snp) in point genotype", required=true ) public Double POINT_VAR_CUTOFF;
@Argument(fullName="indel_consensus_cutoff", shortName="XIC", doc="confidence cutoff for consensus in indel genotype", required=true ) public Double INDEL_CONS_CUTOFF;
@Argument(fullName="indel_variant_cutoff", shortName="XIV", doc="confidence cutoff for variant (snp) in indel genotype", required=true ) public Double INDEL_VAR_CUTOFF;
@Argument(fullName="log_concordant", shortName="LC",doc="If set, all trio-concordant sites will be logged at level INFO") public boolean LOG_CONCORDANT;
@Argument(fullName="log_discordant", shortName="LD",doc="If set, all trio-discordant sites will be logged at level INFO") public boolean LOG_DISCORDANT;
@Argument(fullName="default_reference_calls",shortName="DRC",
doc="If set to INDEL or POINT, any position where the specified genotype is NOT explicitly specified, while the other (point or indel, respectively) is provided, is considered to be a confident 'reference' (no-indel or no-snp) call")
public String defCalls;
@Argument(fullName="variant_type",
shortName="VT",
doc="Assess concordance for the variants of the specified type, INDEL or POINT. If genotype track(s) provide both types, the requested one will be selected",
required=true)
public String VARIANT_TYPE;
private static Logger logger = Logger.getLogger(MendelianInheritanceWalker.class);
private final static String star = new String("*");
private int variant_type = 1; //0 - point, 1 - indel
private int default_calls = 0; // 0 - none, 1 - indels, 2 - point, 3 - both
@Override
public TrioConcordanceRecord map(RefMetaDataTracker rodData, char ref, LocusContext context) {
// String outLine = new String(context.getLocation() + " REF: "+ref + " RODS:" + rodData.getAllRods().size());
ReferenceOrderedDatum rodMom = rodData.lookup("mother", null);
ReferenceOrderedDatum rodDad = rodData.lookup("father", null);
ReferenceOrderedDatum rodKid = rodData.lookup("daughter", null);
Genotype mom = extractGenotype(rodMom,variant_type);
Genotype dad = extractGenotype(rodDad,variant_type);
Genotype kid = extractGenotype(rodKid,variant_type);
return assessGenotypesInTrio(mom, dad, kid);
}
@Override
public void initialize() {
super.initialize();
VARIANT_TYPE = VARIANT_TYPE.toUpperCase();
if ( VARIANT_TYPE.equals("POINT")) variant_type = 0;
else if ( VARIANT_TYPE.equals("INDEL")) variant_type = 1;
else throw new StingException("Unknown value specified for VARIANT_TYPE. Allowed: POINT, INDEL; passed: "+VARIANT_TYPE);
if ( defCalls == null ) return;
defCalls = defCalls.toUpperCase();
if ( defCalls.equals("INDEL")) default_calls = 1;
else throw new StingException("POINT or BOTH default calls are not implemented yet");
};
private Genotype extractGenotype(ReferenceOrderedDatum gl, int variant_type) {
if ( gl == null ) return null;
if ( gl instanceof GenotypeList ) {
if ( variant_type == 0 ) return ((GenotypeList)gl).getPointGenotype();
else if ( variant_type == 1 ) {
if ( ((GenotypeList)gl).hasIndelGenotype() ) return ((GenotypeList)gl).getIndelGenotype();
else {
if ( ( default_calls == 1|| default_calls == 3 ) && ((GenotypeList)gl).hasPointGenotype() ) return new DefaultIndelGenotype(gl.getLocation());
}
}
} else if ( gl instanceof Genotype ) {
switch ( variant_type ) {
case 0:
if ( ((Genotype)gl).isIndelGenotype() ) return null;
else return (Genotype)gl;
case 1:
if ( ((Genotype)gl).isPointGenotype() ) return null;
else return (Genotype)gl;
default: throw new StingException("Unknown variant type specified");
}
}
else throw new StingException("track "+gl.getName()+" is not a Genotype or GenotypeList");
return null;
}
/** Takes a single genotype object and returns properly filled new assessment object (covered/assessed/ref/variant set to 0/1
* according to what the genotype says)
* @param g
* @return
*/
protected GenotypingCallStats assessGenotype(Genotype g, GenotypingCallStats stats) {
if ( g != null ) stats.covered = 1;
if ( hasCall(g)) {
stats.assessed = 1;
if ( g.isReference() ) stats.ref = 1;
else {
stats.variant = 1;
if ( ! g.isBiallelic() ) stats.non_biallelic_variant = 1;
}
}
return stats;
}
public TrioConcordanceRecord assessGenotypesInTrio(Genotype gMom, Genotype gDad, Genotype gKid) {
TrioConcordanceRecord t = new TrioConcordanceRecord();
// String outLine = new String(context.getLocation() + " REF: "+ref + " RODS:" + rodData.getAllRods().size());
Genotype mom = (rodSAMPileup)rodData.lookup("mother", null);
Genotype dad = (rodSAMPileup)rodData.lookup("father", null);
Genotype kid = (rodSAMPileup)rodData.lookup("daughter", null);
if ( hasCall(mom)) {
t.mom_assessed = 1;
if ( mom.isReference() ) t.mom_ref = 1;
else if ( mom.isSNP() ) t.mom_snp = 1;
else if ( mom.isIndel() ) t.mom_indel = 1;
}
if ( hasCall(dad)) {
t.dad_assessed = 1;
if ( dad.isReference() ) t.dad_ref = 1;
else if ( dad.isSNP() ) t.dad_snp = 1;
else if ( dad.isIndel() ) t.dad_indel = 1;
}
// first get separate stats on each individual
assessGenotype(gMom,t.mom);
assessGenotype(gDad,t.dad);
assessGenotype(gKid,t.kid);
if ( hasCall(kid)) {
t.kid_assessed = 1;
if ( kid.isReference() ) t.kid_ref = 1;
else if ( kid.isSNP() ) t.kid_snp = 1;
else if ( kid.isIndel() ) t.kid_indel = 1;
}
// if ( hasCall(mom) && mom.isIndel() ) System.out.println("GOT INDEL: "+mom.toString());
if (( t.mom_assessed + t.dad_assessed + t.kid_assessed) != 3 ) return t; // at least one person is not called; nothing to do, bail out
// proceed only if we got confident calls
t.assessed_loci = 1;
if ( t.mom.covered == 0 || t.dad.covered == 0 || t.kid.covered== 0 ) return t; // current position is not covered in at least one individual, there's nothing else to do
t.trio.covered = 1; // ok, base covered in a trio (e.g. in all individuals)
// ok, we got a call and it is biallelic (at most)
if ( mom.isReference() && dad.isReference() && kid.isReference() ) { // got all refs, we are done
t.consistent_ref = 1;
if ( t.mom.assessed != 0 && t.dad.assessed != 0 && t.kid.assessed != 0 ) {
t.trio.assessed = 1; // assessed in trio = assessed in each individual
} else return t; // at least one individual is not assessed, nothing left to do
// we are here only if everyone is assessed
// NOTE: "consistent_ref" in individuals is counted only when all 3 are assessed AND all 3 are ref (i.e. ref call in a trio)
if( gMom.isReference() && gDad.isReference() && gKid.isReference() ) { // everyone is a ref
t.trio.ref = t.trio.consistent_ref = 1;
t.mom.consistent_ref = 1;
t.dad.consistent_ref = 1;
t.kid.consistent_ref = 1;
return t; // done
}
// by now we know that there's a variant in at least one of the individuals
t.trio.variant = 1;
if ( t.mom.non_biallelic_variant == 1 || t.dad.non_biallelic_variant == 1 || t.kid.non_biallelic_variant == 1 ) {
t.trio.non_biallelic_variant = 1;
return t;
}
// by now we know that there's a SNP or an indel: at least one of the individuals is non-ref
String kid_allele_1 = gKid.getFWDAlleles().get(0);
String kid_allele_2 = gKid.getFWDAlleles().get(1);
List<String> mom_alleles = gMom.getFWDAlleles();
List<String> dad_alleles = gDad.getFWDAlleles();
String kid_allele_1 = kid.getFWDAlleles().get(0);
String kid_allele_2 = kid.getFWDAlleles().get(1);
List<String> mom_alleles = mom.getFWDAlleles();
List<String> dad_alleles = dad.getFWDAlleles();
if ( ! mom.isIndel() && ! dad.isIndel() && ! kid.isIndel() ) {
// individuals have SNPs (any member of the trio is ok, we'll check for consistency in a sec), but no indels at the site:
// warning: no special processing for X/Y chromosomes yet; not an issue for daughter
// warning: no special processing for X/Y chromosomes yet; not an issue for daughter
if ( ! mom.isBiallelic() || ! dad.isBiallelic() || ! kid.isBiallelic() ) {
t.non_biallelic_snp = 1; // we currently ignore non-biallelic sites
return t;
}
if ( mom_alleles.contains(kid_allele_1) && dad_alleles.contains(kid_allele_2) ||
if ( mom_alleles.contains(kid_allele_1) && dad_alleles.contains(kid_allele_2) ||
mom_alleles.contains(kid_allele_2) && dad_alleles.contains(kid_allele_1) ) {
t.consistent_snp = 1;
t.trio.consistent_variant = 1;
if ( ! gMom.isReference() ) {
t.mom.consistent_variant = 1 ;
if ( ! gKid.isReference() ) t.mom_passed_variant = 1;
}
if ( ! gDad.isReference() ) {
t.dad.consistent_variant = 1 ;
if ( ! gKid.isReference() ) t.dad_passed_variant = 1;
}
if ( ! gKid.isReference() ) t.kid.consistent_variant = 1 ;
if ( LOG_CONCORDANT ) {
logger.info("consistent SNP at "+context.getLocation() +
"("+ref+") " + mom_alleles.get(0)+"/" +mom_alleles.get(1) + " " +
dad_alleles.get(0)+"/" +dad_alleles.get(1) + " " +
kid_allele_1+"/" +kid_allele_2
logger.info("consistent variant at "+gMom.getLocation() +
"("+gMom.getFWDRefBases()+") mom: " + genotypeString(gMom) + " dad: " +
genotypeString(gDad) + " kid: " + genotypeString(gKid)
);
}
} else {
} else {
// we are inconsistent. let's see what happened:
if ( kid.isSNP() && ! mom.isSNP() && ! dad.isSNP() ) t.missing_snp_in_parents = 1;
if ( ! kid.isSNP() && ( mom.isSNP() && mom.isHom() || dad.isSNP() && dad.isHom() ) ) t.missing_snp_in_kid = 1;
t.inconsistent_snp = 1;
t.trio.inconsistent_variant = 1;
if ( ! gMom.isReference() ) t.mom.inconsistent_variant = 1 ;
if ( ! gDad.isReference() ) t.dad.inconsistent_variant = 1 ;
if ( ! gKid.isReference() ) t.kid.inconsistent_variant = 1;
if ( gKid.isReference() && ( ! gMom.isReference() && gMom.isHom() || ! gDad.isReference() && gDad.isHom() ) ) t.missing_variant_in_kid = 1;
if ( ! gKid.isReference() && gMom.isReference() && gDad.isReference() ) t.missing_variant_in_parents = 1;
if ( ! gKid.isReference() && ( ! gMom.isReference() || ! gDad.isReference() ) ) t.nonmatching_variant_in_kid = 1;
if ( LOG_DISCORDANT ) {
logger.info("INconsistent SNP at "+context.getLocation() +
"("+ref+") mom:" + mom_alleles.get(0)+"/" +mom_alleles.get(1) + " dad: " +
dad_alleles.get(0)+"/" +dad_alleles.get(1) + " kid: " +
kid_allele_1+"/" +kid_allele_2
logger.info("inconsistent variant at "+gMom.getLocation() +
"("+gMom.getFWDRefBases()+") mom: " + genotypeString(gMom) + " dad: " +
genotypeString(gDad) + " kid: " + genotypeString(gKid)
);
}
}
return t;
}
if ( ! mom.isSNP() && ! dad.isSNP() && ! kid.isSNP() ) {
// individuals have indels (any member of the trio is ok, we'll check for consistency in a seq), but no indels at the site:
// warning: no special processing for X/Y chromosomes yet; not an issue for daughter
if ( ! mom.isBiallelic() || ! dad.isBiallelic() || ! kid.isBiallelic() ) {
t.non_biallelic_indel = 1; // we currently ignore non-biallelic sites
return t;
}
if ( mom_alleles.contains(kid_allele_1) && dad_alleles.contains(kid_allele_2) ||
mom_alleles.contains(kid_allele_2) && dad_alleles.contains(kid_allele_1) ) {
t.consistent_indels = 1;
if ( LOG_CONCORDANT ) {
logger.info("consistent INDEL at "+context.getLocation() +
"("+ref+") mom:" + genotypeString(mom)+ " dad: " +
genotypeString(dad) + " kid: " +
genotypeString(kid)
);
}
} else {
if ( kid.isIndel() && ! mom.isIndel() && ! dad.isIndel() ) t.missing_indels_in_parents = 1;
if ( ! kid.isIndel() && ( mom.isIndel() && mom.isHom() || dad.isIndel() && dad.isHom() ) ) t.missing_indels_in_kid = 1;
t.inconsistent_indels = 1;
if ( LOG_DISCORDANT ) {
logger.info("INconsistent INDEL at "+context.getLocation() +
"("+ref+") mom:" + genotypeString(mom)+ " dad: " +
genotypeString(dad) + " kid: " +
genotypeString(kid)
);
}
}
return t;
}
t.unclassified_events = 1;
return t;
}
protected String alleleString(Genotype g, int n) {
if ( g.getFWDAlleles().get(n).length() == 0 ) return star;
return g.getFWDAlleles().get(n);
@ -179,7 +231,6 @@ public class MendelianInheritanceWalker extends RefWalker<TrioConcordanceRecord
@Override
public TrioConcordanceRecord reduce(TrioConcordanceRecord value, TrioConcordanceRecord sum) {
return sum.add(value);
}
@ -191,10 +242,109 @@ public class MendelianInheritanceWalker extends RefWalker<TrioConcordanceRecord
boolean hasCall(Genotype g) {
if ( g == null ) return false; // there's no call if there's no rod data available, duh!
if ( g.isReference() ) return g.getConsensusConfidence() >= CONS_CUTOFF ;
else {
if ( g.isSNP() ) return g.getVariantConfidence() >= SNP_CUTOFF;
else return Math.max(g.getConsensusConfidence(),g.getVariantConfidence()) >= INDEL_CUTOFF;
if ( g.isReference() ) {
if ( g.isPointGenotype() ) return g.getConsensusConfidence() >= POINT_CONS_CUTOFF ;
else return g.getConsensusConfidence() >= INDEL_CONS_CUTOFF ;
}
else { // it's a variant
if ( g.isPointGenotype() ) return g.getVariantConfidence() >= POINT_VAR_CUTOFF ;
else return g.getVariantConfidence() >= INDEL_VAR_CUTOFF ;
}
}
private static class DefaultIndelGenotype implements Genotype {
private GenomeLoc location;
private static final List<String> alleles = Arrays.asList("","") ;
DefaultIndelGenotype(GenomeLoc l) {
location = l;
}
@Override
public double getConsensusConfidence() {
return 1000;
}
@Override
public List<String> getFWDAlleles() {
return alleles;
}
@Override
public String getFWDRefBases() {
return alleles.get(0);
}
@Override
public GenomeLoc getLocation() {
return location;
}
@Override
public char getRef() {
return '*';
}
@Override
public double getVariantConfidence() {
return 0.0;
}
@Override
public boolean isBiallelic() {
return false;
}
@Override
public boolean isDeletion() {
return false;
}
@Override
public boolean isHet() {
return false;
}
@Override
public boolean isHom() {
return true;
}
@Override
public boolean isIndel() {
return false;
}
@Override
public boolean isIndelGenotype() {
return true;
}
@Override
public boolean isInsertion() {
return false;
}
@Override
public boolean isPointGenotype() {
return false;
}
@Override
public boolean isReference() {
return true;
}
@Override
public boolean isSNP() {
return false;
}
@Override
public int compareTo(ReferenceOrderedDatum o) {
return location.compareTo(o.getLocation());
}
}