From a9dfbfb30973e1a831997eee94cf655d8b782c58 Mon Sep 17 00:00:00 2001 From: asivache Date: Fri, 22 May 2009 17:25:09 +0000 Subject: [PATCH] internal changes and some refactoring. slightly different final report. Now can take tracks that implement either Genotype or GenotypeList; takes an arg specifying what variants to look for (POINT - aka snp - or INDEL); takes an arg specifying whether default ref/ref call of one type (INDEL/POINT) should be implicitly assumed if another call (POINT/INDEL respectively) was made at the same position [this is probably most useful for indels and only (?) for sam pileups: if we have only point mutation call at a given position, it does mean that we do have coverage, and that there was no evidence whatsoever for an indel, so we have an implicit 'no-indel' call] git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@799 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/MendelianInheritanceWalker.java | 386 ++++++++++++------ 1 file changed, 268 insertions(+), 118 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MendelianInheritanceWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MendelianInheritanceWalker.java index c04243700..0ba56f321 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MendelianInheritanceWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MendelianInheritanceWalker.java @@ -1,173 +1,225 @@ package org.broadinstitute.sting.playground.gatk.walkers; +import java.util.Arrays; import java.util.List; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.GenotypeList; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.rodSAMPileup; import org.broadinstitute.sting.gatk.refdata.Genotype; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.playground.utils.GenotypingCallStats; import org.broadinstitute.sting.playground.utils.TrioConcordanceRecord; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.cmdLine.Argument; -@Requires(value=DataSource.REFERENCE,referenceMetaData={@RMD(name="mother",type=rodSAMPileup.class), - @RMD(name="father",type=rodSAMPileup.class), - @RMD(name="daughter",type=rodSAMPileup.class)}) +//@Requires(value=DataSource.REFERENCE,referenceMetaData={@RMD(name="mother",type=rodSAMPileup.class), +// @RMD(name="father",type=rodSAMPileup.class), +// @RMD(name="daughter",type=rodSAMPileup.class)}) public class MendelianInheritanceWalker extends RefWalker { - @Argument(fullName="consensus_cutoff", shortName="XC", doc="consensus cutoff", required=true ) public Double CONS_CUTOFF; - @Argument(fullName="snp_cutoff", shortName="XS", doc="snp cutoff", required=true ) public Double SNP_CUTOFF; - @Argument(fullName="indel_cutoff", shortName="XI", doc="indel cutoff", required=true ) public Double INDEL_CUTOFF; + @Argument(fullName="point_consensus_cutoff", shortName="XPC", doc="confidence cutoff for consensus in point genotype", required=true ) public Double POINT_CONS_CUTOFF; + @Argument(fullName="point_variant_cutoff", shortName="XPV", doc="confidence cutoff for variant (snp) in point genotype", required=true ) public Double POINT_VAR_CUTOFF; + @Argument(fullName="indel_consensus_cutoff", shortName="XIC", doc="confidence cutoff for consensus in indel genotype", required=true ) public Double INDEL_CONS_CUTOFF; + @Argument(fullName="indel_variant_cutoff", shortName="XIV", doc="confidence cutoff for variant (snp) in indel genotype", required=true ) public Double INDEL_VAR_CUTOFF; @Argument(fullName="log_concordant", shortName="LC",doc="If set, all trio-concordant sites will be logged at level INFO") public boolean LOG_CONCORDANT; @Argument(fullName="log_discordant", shortName="LD",doc="If set, all trio-discordant sites will be logged at level INFO") public boolean LOG_DISCORDANT; + @Argument(fullName="default_reference_calls",shortName="DRC", + doc="If set to INDEL or POINT, any position where the specified genotype is NOT explicitly specified, while the other (point or indel, respectively) is provided, is considered to be a confident 'reference' (no-indel or no-snp) call") + public String defCalls; + @Argument(fullName="variant_type", + shortName="VT", + doc="Assess concordance for the variants of the specified type, INDEL or POINT. If genotype track(s) provide both types, the requested one will be selected", + required=true) + public String VARIANT_TYPE; private static Logger logger = Logger.getLogger(MendelianInheritanceWalker.class); private final static String star = new String("*"); + private int variant_type = 1; //0 - point, 1 - indel + private int default_calls = 0; // 0 - none, 1 - indels, 2 - point, 3 - both @Override public TrioConcordanceRecord map(RefMetaDataTracker rodData, char ref, LocusContext context) { + +// String outLine = new String(context.getLocation() + " REF: "+ref + " RODS:" + rodData.getAllRods().size()); + ReferenceOrderedDatum rodMom = rodData.lookup("mother", null); + ReferenceOrderedDatum rodDad = rodData.lookup("father", null); + ReferenceOrderedDatum rodKid = rodData.lookup("daughter", null); + + Genotype mom = extractGenotype(rodMom,variant_type); + Genotype dad = extractGenotype(rodDad,variant_type); + Genotype kid = extractGenotype(rodKid,variant_type); + + return assessGenotypesInTrio(mom, dad, kid); + } + + @Override + public void initialize() { + super.initialize(); + VARIANT_TYPE = VARIANT_TYPE.toUpperCase(); + if ( VARIANT_TYPE.equals("POINT")) variant_type = 0; + else if ( VARIANT_TYPE.equals("INDEL")) variant_type = 1; + else throw new StingException("Unknown value specified for VARIANT_TYPE. Allowed: POINT, INDEL; passed: "+VARIANT_TYPE); + if ( defCalls == null ) return; + defCalls = defCalls.toUpperCase(); + if ( defCalls.equals("INDEL")) default_calls = 1; + else throw new StingException("POINT or BOTH default calls are not implemented yet"); + }; + + private Genotype extractGenotype(ReferenceOrderedDatum gl, int variant_type) { + + if ( gl == null ) return null; + + if ( gl instanceof GenotypeList ) { + + if ( variant_type == 0 ) return ((GenotypeList)gl).getPointGenotype(); + else if ( variant_type == 1 ) { + if ( ((GenotypeList)gl).hasIndelGenotype() ) return ((GenotypeList)gl).getIndelGenotype(); + else { + if ( ( default_calls == 1|| default_calls == 3 ) && ((GenotypeList)gl).hasPointGenotype() ) return new DefaultIndelGenotype(gl.getLocation()); + } + } + } else if ( gl instanceof Genotype ) { + switch ( variant_type ) { + case 0: + if ( ((Genotype)gl).isIndelGenotype() ) return null; + else return (Genotype)gl; + case 1: + if ( ((Genotype)gl).isPointGenotype() ) return null; + else return (Genotype)gl; + default: throw new StingException("Unknown variant type specified"); + } + + } + else throw new StingException("track "+gl.getName()+" is not a Genotype or GenotypeList"); + return null; + } + + /** Takes a single genotype object and returns properly filled new assessment object (covered/assessed/ref/variant set to 0/1 + * according to what the genotype says) + * @param g + * @return + */ + protected GenotypingCallStats assessGenotype(Genotype g, GenotypingCallStats stats) { + + if ( g != null ) stats.covered = 1; + + if ( hasCall(g)) { + stats.assessed = 1; + if ( g.isReference() ) stats.ref = 1; + else { + stats.variant = 1; + if ( ! g.isBiallelic() ) stats.non_biallelic_variant = 1; + } + } + return stats; + } + + public TrioConcordanceRecord assessGenotypesInTrio(Genotype gMom, Genotype gDad, Genotype gKid) { + TrioConcordanceRecord t = new TrioConcordanceRecord(); // String outLine = new String(context.getLocation() + " REF: "+ref + " RODS:" + rodData.getAllRods().size()); - - Genotype mom = (rodSAMPileup)rodData.lookup("mother", null); - Genotype dad = (rodSAMPileup)rodData.lookup("father", null); - Genotype kid = (rodSAMPileup)rodData.lookup("daughter", null); - - if ( hasCall(mom)) { - t.mom_assessed = 1; - if ( mom.isReference() ) t.mom_ref = 1; - else if ( mom.isSNP() ) t.mom_snp = 1; - else if ( mom.isIndel() ) t.mom_indel = 1; - } - if ( hasCall(dad)) { - t.dad_assessed = 1; - if ( dad.isReference() ) t.dad_ref = 1; - else if ( dad.isSNP() ) t.dad_snp = 1; - else if ( dad.isIndel() ) t.dad_indel = 1; - } + // first get separate stats on each individual + assessGenotype(gMom,t.mom); + assessGenotype(gDad,t.dad); + assessGenotype(gKid,t.kid); - if ( hasCall(kid)) { - t.kid_assessed = 1; - if ( kid.isReference() ) t.kid_ref = 1; - else if ( kid.isSNP() ) t.kid_snp = 1; - else if ( kid.isIndel() ) t.kid_indel = 1; - } - - // if ( hasCall(mom) && mom.isIndel() ) System.out.println("GOT INDEL: "+mom.toString()); - if (( t.mom_assessed + t.dad_assessed + t.kid_assessed) != 3 ) return t; // at least one person is not called; nothing to do, bail out - - // proceed only if we got confident calls - - t.assessed_loci = 1; - + if ( t.mom.covered == 0 || t.dad.covered == 0 || t.kid.covered== 0 ) return t; // current position is not covered in at least one individual, there's nothing else to do + t.trio.covered = 1; // ok, base covered in a trio (e.g. in all individuals) - // ok, we got a call and it is biallelic (at most) - - if ( mom.isReference() && dad.isReference() && kid.isReference() ) { // got all refs, we are done - t.consistent_ref = 1; + if ( t.mom.assessed != 0 && t.dad.assessed != 0 && t.kid.assessed != 0 ) { + t.trio.assessed = 1; // assessed in trio = assessed in each individual + } else return t; // at least one individual is not assessed, nothing left to do + + // we are here only if everyone is assessed + + // NOTE: "consistent_ref" in individuals is counted only when all 3 are assessed AND all 3 are ref (i.e. ref call in a trio) + if( gMom.isReference() && gDad.isReference() && gKid.isReference() ) { // everyone is a ref + t.trio.ref = t.trio.consistent_ref = 1; + t.mom.consistent_ref = 1; + t.dad.consistent_ref = 1; + t.kid.consistent_ref = 1; + return t; // done + } + + // by now we know that there's a variant in at least one of the individuals + + t.trio.variant = 1; + + if ( t.mom.non_biallelic_variant == 1 || t.dad.non_biallelic_variant == 1 || t.kid.non_biallelic_variant == 1 ) { + t.trio.non_biallelic_variant = 1; return t; } - // by now we know that there's a SNP or an indel: at least one of the individuals is non-ref + String kid_allele_1 = gKid.getFWDAlleles().get(0); + String kid_allele_2 = gKid.getFWDAlleles().get(1); + List mom_alleles = gMom.getFWDAlleles(); + List dad_alleles = gDad.getFWDAlleles(); - String kid_allele_1 = kid.getFWDAlleles().get(0); - String kid_allele_2 = kid.getFWDAlleles().get(1); - List mom_alleles = mom.getFWDAlleles(); - List dad_alleles = dad.getFWDAlleles(); - - if ( ! mom.isIndel() && ! dad.isIndel() && ! kid.isIndel() ) { - // individuals have SNPs (any member of the trio is ok, we'll check for consistency in a sec), but no indels at the site: - - // warning: no special processing for X/Y chromosomes yet; not an issue for daughter + // warning: no special processing for X/Y chromosomes yet; not an issue for daughter - if ( ! mom.isBiallelic() || ! dad.isBiallelic() || ! kid.isBiallelic() ) { - t.non_biallelic_snp = 1; // we currently ignore non-biallelic sites - return t; - } - if ( mom_alleles.contains(kid_allele_1) && dad_alleles.contains(kid_allele_2) || + if ( mom_alleles.contains(kid_allele_1) && dad_alleles.contains(kid_allele_2) || mom_alleles.contains(kid_allele_2) && dad_alleles.contains(kid_allele_1) ) { - t.consistent_snp = 1; - + t.trio.consistent_variant = 1; + if ( ! gMom.isReference() ) { + t.mom.consistent_variant = 1 ; + if ( ! gKid.isReference() ) t.mom_passed_variant = 1; + } + if ( ! gDad.isReference() ) { + t.dad.consistent_variant = 1 ; + if ( ! gKid.isReference() ) t.dad_passed_variant = 1; + } + if ( ! gKid.isReference() ) t.kid.consistent_variant = 1 ; + if ( LOG_CONCORDANT ) { - logger.info("consistent SNP at "+context.getLocation() + - "("+ref+") " + mom_alleles.get(0)+"/" +mom_alleles.get(1) + " " + - dad_alleles.get(0)+"/" +dad_alleles.get(1) + " " + - kid_allele_1+"/" +kid_allele_2 + logger.info("consistent variant at "+gMom.getLocation() + + "("+gMom.getFWDRefBases()+") mom: " + genotypeString(gMom) + " dad: " + + genotypeString(gDad) + " kid: " + genotypeString(gKid) ); } - } else { + } else { // we are inconsistent. let's see what happened: - if ( kid.isSNP() && ! mom.isSNP() && ! dad.isSNP() ) t.missing_snp_in_parents = 1; - if ( ! kid.isSNP() && ( mom.isSNP() && mom.isHom() || dad.isSNP() && dad.isHom() ) ) t.missing_snp_in_kid = 1; - t.inconsistent_snp = 1; + t.trio.inconsistent_variant = 1; + + if ( ! gMom.isReference() ) t.mom.inconsistent_variant = 1 ; + if ( ! gDad.isReference() ) t.dad.inconsistent_variant = 1 ; + if ( ! gKid.isReference() ) t.kid.inconsistent_variant = 1; + + if ( gKid.isReference() && ( ! gMom.isReference() && gMom.isHom() || ! gDad.isReference() && gDad.isHom() ) ) t.missing_variant_in_kid = 1; + if ( ! gKid.isReference() && gMom.isReference() && gDad.isReference() ) t.missing_variant_in_parents = 1; + if ( ! gKid.isReference() && ( ! gMom.isReference() || ! gDad.isReference() ) ) t.nonmatching_variant_in_kid = 1; + if ( LOG_DISCORDANT ) { - logger.info("INconsistent SNP at "+context.getLocation() + - "("+ref+") mom:" + mom_alleles.get(0)+"/" +mom_alleles.get(1) + " dad: " + - dad_alleles.get(0)+"/" +dad_alleles.get(1) + " kid: " + - kid_allele_1+"/" +kid_allele_2 + logger.info("inconsistent variant at "+gMom.getLocation() + + "("+gMom.getFWDRefBases()+") mom: " + genotypeString(gMom) + " dad: " + + genotypeString(gDad) + " kid: " + genotypeString(gKid) ); } - } - return t; + } - - if ( ! mom.isSNP() && ! dad.isSNP() && ! kid.isSNP() ) { - // individuals have indels (any member of the trio is ok, we'll check for consistency in a seq), but no indels at the site: - - // warning: no special processing for X/Y chromosomes yet; not an issue for daughter - - if ( ! mom.isBiallelic() || ! dad.isBiallelic() || ! kid.isBiallelic() ) { - t.non_biallelic_indel = 1; // we currently ignore non-biallelic sites - return t; - } - - if ( mom_alleles.contains(kid_allele_1) && dad_alleles.contains(kid_allele_2) || - mom_alleles.contains(kid_allele_2) && dad_alleles.contains(kid_allele_1) ) { - t.consistent_indels = 1; - if ( LOG_CONCORDANT ) { - logger.info("consistent INDEL at "+context.getLocation() + - "("+ref+") mom:" + genotypeString(mom)+ " dad: " + - genotypeString(dad) + " kid: " + - genotypeString(kid) - ); - } - } else { - if ( kid.isIndel() && ! mom.isIndel() && ! dad.isIndel() ) t.missing_indels_in_parents = 1; - if ( ! kid.isIndel() && ( mom.isIndel() && mom.isHom() || dad.isIndel() && dad.isHom() ) ) t.missing_indels_in_kid = 1; - - t.inconsistent_indels = 1; - - if ( LOG_DISCORDANT ) { - logger.info("INconsistent INDEL at "+context.getLocation() + - "("+ref+") mom:" + genotypeString(mom)+ " dad: " + - genotypeString(dad) + " kid: " + - genotypeString(kid) - ); - } - } - return t; - } - - t.unclassified_events = 1; - return t; } - + + protected String alleleString(Genotype g, int n) { if ( g.getFWDAlleles().get(n).length() == 0 ) return star; return g.getFWDAlleles().get(n); @@ -179,7 +231,6 @@ public class MendelianInheritanceWalker extends RefWalker= CONS_CUTOFF ; - else { - if ( g.isSNP() ) return g.getVariantConfidence() >= SNP_CUTOFF; - else return Math.max(g.getConsensusConfidence(),g.getVariantConfidence()) >= INDEL_CUTOFF; + + if ( g.isReference() ) { + if ( g.isPointGenotype() ) return g.getConsensusConfidence() >= POINT_CONS_CUTOFF ; + else return g.getConsensusConfidence() >= INDEL_CONS_CUTOFF ; + } + else { // it's a variant + if ( g.isPointGenotype() ) return g.getVariantConfidence() >= POINT_VAR_CUTOFF ; + else return g.getVariantConfidence() >= INDEL_VAR_CUTOFF ; + } + + } + + private static class DefaultIndelGenotype implements Genotype { + private GenomeLoc location; + private static final List alleles = Arrays.asList("","") ; + + DefaultIndelGenotype(GenomeLoc l) { + location = l; + } + + @Override + public double getConsensusConfidence() { + return 1000; + } + + @Override + public List getFWDAlleles() { + return alleles; + } + + @Override + public String getFWDRefBases() { + return alleles.get(0); + } + + @Override + public GenomeLoc getLocation() { + return location; + } + + @Override + public char getRef() { + return '*'; + } + + @Override + public double getVariantConfidence() { + return 0.0; + } + + @Override + public boolean isBiallelic() { + return false; + } + + @Override + public boolean isDeletion() { + return false; + } + + @Override + public boolean isHet() { + return false; + } + + @Override + public boolean isHom() { + return true; + } + + @Override + public boolean isIndel() { + return false; + } + + @Override + public boolean isIndelGenotype() { + return true; + } + + @Override + public boolean isInsertion() { + return false; + } + + @Override + public boolean isPointGenotype() { + return false; + } + + @Override + public boolean isReference() { + return true; + } + + @Override + public boolean isSNP() { + return false; + } + + @Override + public int compareTo(ReferenceOrderedDatum o) { + return location.compareTo(o.getLocation()); } }