2010-06-09 04:47:01 +08:00
package org.broadinstitute.sting.oneoffprojects.walkers ;
import org.broad.tribble.vcf.VCFHeader ;
import org.broad.tribble.vcf.VCFHeaderLine ;
import org.broadinstitute.sting.commandline.Argument ;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext ;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext ;
2010-06-09 22:21:19 +08:00
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext ;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele ;
2010-06-09 04:47:01 +08:00
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype ;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext ;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker ;
import org.broadinstitute.sting.gatk.walkers.LocusWalker ;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection ;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine ;
2010-06-09 22:21:19 +08:00
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext ;
2010-06-09 04:47:01 +08:00
import org.broadinstitute.sting.gatk.walkers.varianteval.MendelianViolationEvaluator ;
import org.broadinstitute.sting.utils.GenomeLoc ;
2010-06-19 03:32:24 +08:00
import org.broadinstitute.sting.utils.GenomeLocParser ;
2010-06-09 04:47:01 +08:00
import org.broadinstitute.sting.utils.SampleUtils ;
import org.broadinstitute.sting.utils.StingException ;
2010-06-19 03:32:24 +08:00
import org.broadinstitute.sting.utils.collections.Pair ;
2010-06-09 04:47:01 +08:00
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils ;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter ;
2010-06-09 22:21:19 +08:00
import org.broadinstitute.sting.utils.pileup.PileupElement ;
2010-06-09 04:47:01 +08:00
2010-06-09 22:21:19 +08:00
import java.io.PrintStream ;
2010-06-09 04:47:01 +08:00
import java.util.* ;
/ * *
* Takes in a VCF file for a trio ( and optionally , bam files for some or all members ) and classifies mendelian violations
* as deNovo events , or opposite homozygotes , and includes additional information pertaining to event QC ( size of
* homozygous region in child and / or parents , whether a site is likely tri - allelic or not , etc )
*
* @Author chartl
* @Date Jun 8 , 2010
* /
public class MendelianViolationClassifier extends LocusWalker < MendelianViolationClassifier . MendelianViolation , VCFWriter > {
@Argument ( shortName = "f" , fullName = "familyPattern" , required = true , doc = "Pattern for the family structure (usage: mom+dad=child)" )
String familyStr = null ;
2010-06-09 22:21:19 +08:00
@Argument ( shortName = "ob" , fullName = "outputBed" , required = true , doc = "Output file to write the homozygous region information to" )
PrintStream bedOutput = null ;
2010-06-19 03:32:24 +08:00
@Argument ( fullName = "deNovoTriAllelicQ" , required = false , doc = "Cutoff for quality scores of 3rd allele at denovo mendelian violations to remove it from the denovo set; e.g. Q40 = need Q40 evidence for 3rd allele to toss out deNovo" )
int deNovoTriQ = 20 ;
@Argument ( fullName = "deNovoParentalAllele" , required = false , doc = "Range for the parental allele at denovo sites to be kept in denovo set, e.g. 0.4-0.6 will toss out denovo sites with parental allele proportions of <0.4 and >0.6" )
String deNovoParentalAllele = "-0.1-1.1" ;
@Argument ( fullName = "oppositeHomozygoteTriAllelicQ" , required = false , doc = "Cutoff for quality scores of 3rd allele at opposite homozygote sites to remove it from the violation set" )
int opHomTriQ = 20 ;
2010-06-23 23:49:13 +08:00
@Argument ( fullName = "oppositeHomozygoteAlleleProportion" , required = false , doc = "Range for the parental allele in the parents at opposite homozygote sites for it to be kept in violation set" )
String opHomAlleleProp = "-0.1-1.1" ;
2010-06-19 03:32:24 +08:00
2010-06-09 04:47:01 +08:00
/ *
2010-06-09 22:21:19 +08:00
* * * * * * * * * * * PRIVATE CLASSES
2010-06-09 04:47:01 +08:00
* /
public class ExtendedTrioStructure extends MendelianViolationEvaluator . TrioStructure {
public HashMap < String , HomozygosityRegion > homozygousRegions ;
public HashMap < String , Integer > homozygousRegionCounts ;
2010-06-09 22:21:19 +08:00
public HashMap < String , MendelianInfoKey > regionKeys ;
2010-06-09 04:47:01 +08:00
public ExtendedTrioStructure ( String family ) {
MendelianViolationEvaluator . TrioStructure struct = MendelianViolationEvaluator . parseTrioDescription ( family ) ;
this . child = struct . child ;
this . mom = struct . mom ;
this . dad = struct . dad ;
homozygousRegions = new HashMap < String , HomozygosityRegion > ( 3 ) ;
homozygousRegionCounts = new HashMap < String , Integer > ( 3 ) ;
homozygousRegions . put ( child , null ) ;
homozygousRegions . put ( mom , null ) ;
homozygousRegions . put ( dad , null ) ;
homozygousRegionCounts . put ( child , 0 ) ;
homozygousRegionCounts . put ( mom , 0 ) ;
2010-06-10 02:05:37 +08:00
homozygousRegionCounts . put ( dad , 0 ) ;
2010-06-09 22:21:19 +08:00
regionKeys = new HashMap < String , MendelianInfoKey > ( 3 ) ;
regionKeys . put ( child , MendelianInfoKey . ChildHomozygosityRegion ) ;
regionKeys . put ( mom , MendelianInfoKey . MotherHomozygosityRegion ) ;
regionKeys . put ( dad , MendelianInfoKey . FatherHomozygosityRegion ) ;
}
public void updateHomozygosityRegions ( MendelianViolation v , PrintStream output ) {
if ( ! v . siteIsFiltered ( ) ) {
ArrayList < HomozygosityRegion > brokenRegions = new ArrayList < HomozygosityRegion > ( 3 ) ;
// can only enter or break regions at unfiltered calls
for ( Map . Entry < String , Genotype > memberGenotype : v . getUnderlyingGenotypes ( ) . entrySet ( ) ) {
// for each family member
if ( homozygousRegions . get ( memberGenotype . getKey ( ) ) = = null ) {
// currently in a heterozygous region, update if possible
if ( memberGenotype . getValue ( ) . isHom ( ) ) {
homozygousRegionCounts . put ( memberGenotype . getKey ( ) , homozygousRegionCounts . get ( memberGenotype . getKey ( ) ) + 1 ) ;
homozygousRegions . put ( memberGenotype . getKey ( ) , new HomozygosityRegion ( v . getLocus ( ) ) ) ;
if ( v . type ! = MendelianViolationType . NONE ) {
v . addAttribute ( regionKeys . get ( memberGenotype . getKey ( ) ) . getKey ( ) , homozygousRegionCounts . get ( memberGenotype . getKey ( ) ) ) ;
}
}
} else {
// potentially breaking a homozygous region
if ( memberGenotype . getValue ( ) . isHom ( ) ) {
// no break, update the region
HomozygosityRegion r = homozygousRegions . get ( memberGenotype . getKey ( ) ) ;
r . lastSeen = v . getLocus ( ) ;
r . callsWithinRegion + + ;
2010-06-23 23:49:13 +08:00
if ( v . type ! = MendelianViolationType . NONE & & ! v . violationIsFiltered ( ) ) {
2010-06-09 22:21:19 +08:00
v . addAttribute ( regionKeys . get ( memberGenotype . getKey ( ) ) . getKey ( ) , homozygousRegionCounts . get ( memberGenotype . getKey ( ) ) ) ;
if ( v . type = = MendelianViolationType . DE_NOVO_SNP ) {
r . deNovoSNPsInRegion + + ;
} else if ( v . type = = MendelianViolationType . OPPOSITE_HOMOZYGOTE ) {
r . oppositeHomsInRegion + + ;
}
}
} else if ( memberGenotype . getValue ( ) . isHet ( ) ) {
// explicitly check for hets -- no calls are not counted -- this breaks a region so we print it
homozygousRegions . get ( memberGenotype . getKey ( ) ) . finalize ( v . getLocus ( ) , memberGenotype . getKey ( ) , homozygousRegionCounts . get ( memberGenotype . getKey ( ) ) ) ;
brokenRegions . add ( homozygousRegions . get ( memberGenotype . getKey ( ) ) ) ;
homozygousRegions . put ( memberGenotype . getKey ( ) , null ) ;
}
}
}
if ( brokenRegions . size ( ) > 0 ) {
Collections . sort ( brokenRegions ) ;
}
for ( HomozygosityRegion r : brokenRegions ) {
output . printf ( "%s%n" , r ) ;
}
}
2010-06-09 04:47:01 +08:00
}
}
2010-06-09 22:21:19 +08:00
public class HomozygosityRegion implements Comparable {
2010-06-09 04:47:01 +08:00
public GenomeLoc regionStart ;
2010-06-09 22:21:19 +08:00
public GenomeLoc lastSeen ;
public GenomeLoc endedBy ;
2010-06-09 04:47:01 +08:00
public int callsWithinRegion ;
2010-06-09 22:21:19 +08:00
public int oppositeHomsInRegion ;
public int deNovoSNPsInRegion ;
private String parent ;
private int id ;
2010-06-09 04:47:01 +08:00
public HomozygosityRegion ( GenomeLoc start ) {
regionStart = start ;
2010-06-09 22:21:19 +08:00
lastSeen = start ;
endedBy = null ;
2010-06-09 04:47:01 +08:00
callsWithinRegion = 0 ;
2010-06-09 22:21:19 +08:00
oppositeHomsInRegion = 0 ;
deNovoSNPsInRegion = 0 ;
}
public void finalize ( GenomeLoc regionEnd , String parent , int id ) {
endedBy = regionEnd ;
this . parent = parent ;
this . id = id ;
2010-06-09 04:47:01 +08:00
}
2010-06-09 22:21:19 +08:00
private int getSizeLowerBound ( ) {
return lastSeen . distance ( regionStart ) ;
}
private int getSizeOfFirstHomToFirstHet ( ) {
return endedBy . distance ( regionStart ) ;
}
public String toString ( ) {
2010-06-19 03:32:24 +08:00
return String . format ( "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d" , regionStart . getContig ( ) , regionStart . getStart ( ) ,
lastSeen . getStart ( ) , endedBy . getStart ( ) , parent , id , getSizeLowerBound ( ) , getSizeOfFirstHomToFirstHet ( ) ,
callsWithinRegion , oppositeHomsInRegion , deNovoSNPsInRegion ) ;
2010-06-09 22:21:19 +08:00
}
public int compareTo ( Object o ) {
if ( ! ( o instanceof HomozygosityRegion ) ) {
return Integer . MIN_VALUE ;
}
return this . regionStart . compareTo ( ( ( HomozygosityRegion ) o ) . regionStart ) ;
}
2010-06-19 03:32:24 +08:00
public String getContigStr ( ) {
return regionStart . getContig ( ) ;
}
2010-06-09 04:47:01 +08:00
}
public class MendelianViolation {
private VariantContext trio ;
public MendelianViolationType type ;
private HashMap < String , Object > newAttributes ;
2010-06-09 22:21:19 +08:00
private HashMap < String , Integer > homozygosityRegions ;
2010-06-23 23:49:13 +08:00
private boolean filtered = false ;
2010-06-09 04:47:01 +08:00
public MendelianViolation ( VariantContext context , MendelianViolationType violationType ) {
trio = context ;
type = violationType ;
newAttributes = new HashMap < String , Object > ( ) ;
newAttributes . put ( MendelianInfoKey . ViolationType . getKey ( ) , type ) ;
2010-06-09 22:21:19 +08:00
homozygosityRegions = new HashMap < String , Integer > ( 3 ) ;
2010-06-09 04:47:01 +08:00
}
public void addAttribute ( String key , Object val ) {
newAttributes . put ( key , val ) ;
}
public VariantContext toVariantContext ( ) {
newAttributes . putAll ( trio . getAttributes ( ) ) ;
2010-07-21 02:01:45 +08:00
return new VariantContext ( trio . getName ( ) , trio . getLocation ( ) , trio . getAlleles ( ) , trio . getGenotypes ( ) , trio . getNegLog10PError ( ) , trio . filtersWereApplied ( ) ? trio . getFilters ( ) : null , newAttributes ) ;
2010-06-09 04:47:01 +08:00
}
public boolean siteIsFiltered ( ) {
return trio . isFiltered ( ) ;
}
2010-06-09 22:21:19 +08:00
public void addRegions ( HashMap < String , Integer > regionIDsByName ) {
for ( Map . Entry < String , Integer > e : regionIDsByName . entrySet ( ) ) {
setRegion ( e . getKey ( ) , e . getValue ( ) ) ;
}
}
public void setRegion ( String parent , int regionID ) {
homozygosityRegions . put ( parent , regionID ) ;
}
public boolean isInPreviousRegions ( Map < String , Integer > otherRegions ) {
for ( String s : otherRegions . keySet ( ) ) {
if ( homozygosityRegions . get ( s ) > = otherRegions . get ( s ) ) {
return false ;
}
}
return true ;
}
public Map < String , Genotype > getUnderlyingGenotypes ( ) {
return trio . getGenotypes ( ) ;
}
public GenomeLoc getLocus ( ) {
return trio . getLocation ( ) ;
}
public byte getRefBase ( ) {
return trio . getReference ( ) . getBases ( ) [ 0 ] ;
}
public Object getAttribute ( String key ) {
if ( newAttributes . keySet ( ) . contains ( key ) ) {
return newAttributes . get ( key ) ;
} else {
return trio . getAttribute ( key ) ;
}
}
2010-06-23 23:49:13 +08:00
public void filter ( ) {
filtered = true ;
newAttributes . put ( MendelianInfoKey . ViolationType . getKey ( ) , "Filtered_" + newAttributes . get ( MendelianInfoKey . ViolationType . getKey ( ) ) ) ;
}
public String getType ( ) {
return filtered ? "Filtered_" + type . toString ( ) : type . toString ( ) ;
}
public boolean violationIsFiltered ( ) {
return filtered ;
}
2010-06-09 04:47:01 +08:00
}
2010-06-19 03:32:24 +08:00
public class Range {
private double upper ;
private double lower ;
private double epsilon = 10 e - 3 ;
Range ( String rangeStr ) {
String rs = rangeStr . substring ( 0 ) ; // don't clobber original string
boolean startIsNegative = rangeStr . startsWith ( "-" ) ;
if ( startIsNegative ) {
rs = rs . substring ( 1 ) ;
}
String [ ] lu = rs . split ( "-" ) ;
lower = startIsNegative ? - 1 * Double . parseDouble ( lu [ 0 ] ) : Double . parseDouble ( lu [ 0 ] ) ;
upper = Double . parseDouble ( lu [ 1 ] ) ;
//System.out.printf("Lower: %.2f, Upper: %.2f",lower,upper);
}
public boolean contains ( double p ) {
return p > lower - epsilon & & p < upper + epsilon ;
}
}
2010-06-09 04:47:01 +08:00
/ *
2010-06-09 22:21:19 +08:00
* * * * * * * * * * * * * * * PRIVATE ENUMS
2010-06-09 04:47:01 +08:00
* /
public enum MendelianViolationType {
OPPOSITE_HOMOZYGOTE ( "oppositeHomozygote" ) ,
DE_NOVO_SNP ( "deNovoSNP" ) ,
NONE ( "none" ) ;
private String infoString ;
MendelianViolationType ( String typeName ) {
infoString = typeName ;
}
public String toString ( ) {
2010-06-23 23:49:13 +08:00
return infoString ;
2010-06-09 04:47:01 +08:00
}
}
public enum MendelianInfoKey {
2010-06-29 02:33:23 +08:00
ViolationType ( "MVT" , "String" , 1 , "\"The Mendelian violation type\"" ) ,
ParentalDeletion ( "deletedParent" , "String" , 1 , "\"The parent from whom the child (putatively) inherited a deletion at opposite homozygous sites\"" ) ,
TriAllelicQuality ( "TriAlQ" , "Integer" , 1 , "\"The variant quality of the third allele at putative tri-allelic sites\"" ) ,
TriAllelicBase ( "TriAlB" , "String" , 1 , "\"The third allele at putative tri-allelic sites\"" ) ,
MotherHomozygosityRegion ( "MHR" , "String" , 1 , "\"An identifier for the mother's homozygosity region where the violation is located\"" ) ,
FatherHomozygosityRegion ( "FHR" , "String" , 1 , "\"An identifier for the father's homozygosity region where the violation is located\"" ) ,
ChildHomozygosityRegion ( "CHR" , "Integer" , 1 , "\"An identifier for the child's homozygosity region where the violation is located\"" ) ,
ProportionOfParentAllele ( "PropParent" , "Float" , 1 , "\"The proportion of bases in the child that were the parent allele at deNovo SNP sites\"" ) ,
2010-06-09 22:21:19 +08:00
/* ***************************************** UNUSED ************************************************ */
2010-06-29 02:33:23 +08:00
NumCallsInRegion ( "REGION_NCALLS" , "Integer" , 1 , "\"The number of unfiltered SNP calls found in the homozygosity region\"" ) ,
ChildHomozygosityRegionSize ( "CHRS" , "Integer" , 1 , "\"The size of the region of homozygosity in the child in which the opposite homozygote is located\"" ) ,
OppositeHomozygotesInRegion ( "CHROH" , "Integer" , 1 , "\"The number of opposite-homozygotes located in the region of homozygosity\"" ) ,
ParentHomozygosityRegionSize ( "PHRS" , "Integer" , 1 , "\"The size of the parental homozygosity region where the deNovo SNP is located\"" ) ,
DeNovoSNPsInRegion ( "PHRDN" , "Integer" , 1 , "\"The number of deNovo SNP events located in the same region of parental homozygosity where the deNovo SNP is located\"" ) ;
2010-06-09 04:47:01 +08:00
String keyName ;
String valueType ;
String valueDescription ;
2010-06-29 02:33:23 +08:00
int numFields ;
2010-06-09 04:47:01 +08:00
2010-06-29 02:33:23 +08:00
MendelianInfoKey ( String keyStr , String infoType , int fields , String description ) {
2010-06-09 04:47:01 +08:00
keyName = keyStr ;
valueType = infoType ;
valueDescription = description ;
2010-06-29 02:33:23 +08:00
numFields = fields ;
2010-06-09 04:47:01 +08:00
}
public String toString ( ) {
2010-06-29 02:33:23 +08:00
return String . format ( "%s,%s,%d,%s" , keyName , valueType , numFields , valueDescription ) ;
2010-06-09 04:47:01 +08:00
}
public String getKey ( ) {
return keyName ;
}
}
/ *
2010-06-09 22:21:19 +08:00
* * * * * * * * * * * * * * * * PRIVATE DATA
2010-06-09 04:47:01 +08:00
* /
private ExtendedTrioStructure trioStructure ;
private UnifiedGenotyperEngine engine ;
2010-06-19 03:32:24 +08:00
private Range deNovoRange ;
2010-06-23 23:49:13 +08:00
private Range opHomRange ;
2010-06-09 04:47:01 +08:00
/ *
2010-06-09 22:21:19 +08:00
* * * * * * * * * * * * * * * * * INITIALIZE
2010-06-09 04:47:01 +08:00
* /
public void initialize ( ) {
trioStructure = new ExtendedTrioStructure ( familyStr ) ;
2010-06-19 03:32:24 +08:00
deNovoRange = new Range ( deNovoParentalAllele ) ;
2010-06-23 23:49:13 +08:00
opHomRange = new Range ( opHomAlleleProp ) ;
2010-06-09 04:47:01 +08:00
UnifiedArgumentCollection uac = new UnifiedArgumentCollection ( ) ;
uac . MIN_BASE_QUALTY_SCORE = 10 ;
uac . MIN_MAPPING_QUALTY_SCORE = 10 ;
2010-06-19 03:32:24 +08:00
uac . STANDARD_CONFIDENCE_FOR_CALLING = Math . min ( deNovoTriQ , opHomTriQ ) ;
2010-06-09 04:47:01 +08:00
engine = new UnifiedGenotyperEngine ( getToolkit ( ) , uac ) ;
2010-06-09 22:21:19 +08:00
logger . info ( "Mom: " + trioStructure . mom + " Dad: " + trioStructure . dad + " Child: " + trioStructure . child ) ;
2010-06-10 02:05:37 +08:00
bedOutput . printf ( "%s%n" , getBedFileHeader ( ) ) ;
2010-06-09 04:47:01 +08:00
}
/ *
* * * * * * * * * * * REDUCE INIT
* /
public VCFWriter reduceInit ( ) {
VCFWriter writer = new VCFWriter ( out ) ;
Set < VCFHeaderLine > hInfo = new HashSet < VCFHeaderLine > ( ) ;
hInfo . addAll ( VCFUtils . getHeaderFields ( getToolkit ( ) ) ) ;
2010-06-23 23:49:13 +08:00
hInfo . add ( new VCFHeaderLine ( "source" , "MendelianViolationClassifier" ) ) ;
2010-06-09 04:47:01 +08:00
for ( MendelianInfoKey key : EnumSet . allOf ( MendelianInfoKey . class ) ) {
hInfo . add ( new VCFHeaderLine ( "INFO" , key . toString ( ) ) ) ;
}
VCFHeader vcfHeader = new VCFHeader ( hInfo , SampleUtils . getUniqueSamplesFromRods ( getToolkit ( ) ) ) ;
writer . writeHeader ( vcfHeader ) ;
return writer ;
}
2010-06-10 02:05:37 +08:00
/ *
* * * * * * * * * * * * * * * * * FILTER
* /
public boolean filter ( RefMetaDataTracker tracker , ReferenceContext ref , AlignmentContext context ) {
return tracker ! = null ;
}
2010-06-09 04:47:01 +08:00
/ *
2010-06-09 22:21:19 +08:00
* * * * * * * * * * * * * * * MAP
2010-06-09 04:47:01 +08:00
* /
public MendelianViolation map ( RefMetaDataTracker tracker , ReferenceContext ref , AlignmentContext context ) {
2010-06-10 02:05:37 +08:00
return assessViolation ( tracker . getVariantContext ( ref , "trio" , EnumSet . of ( VariantContext . Type . SNP ) , ref . getLocus ( ) , true ) , tracker , ref , context ) ;
}
private boolean isComplete ( VariantContext vc ) {
for ( Genotype g : vc . getGenotypes ( ) . values ( ) ) {
if ( g . isNoCall ( ) | | g . isFiltered ( ) ) {
return false ;
}
2010-06-09 04:47:01 +08:00
}
2010-06-10 02:05:37 +08:00
return true ;
2010-06-09 04:47:01 +08:00
}
private MendelianViolation assessViolation ( VariantContext varContext , RefMetaDataTracker tracker , ReferenceContext reference , AlignmentContext context ) {
MendelianViolation violation ;
2010-06-09 22:21:19 +08:00
if ( varContext ! = null ) {
2010-06-10 02:05:37 +08:00
if ( isComplete ( varContext ) & & MendelianViolationEvaluator . isViolation ( varContext , trioStructure ) ) {
2010-06-09 22:21:19 +08:00
if ( isDeNovo ( varContext ) ) {
violation = assessDeNovo ( varContext , tracker , reference , context ) ;
} else if ( isOppositeHomozygote ( varContext ) ) {
violation = assessOppositeHomozygote ( varContext , tracker , reference , context ) ;
} else {
throw new StingException ( "Mendelian violation that is neither deNovo nor opposite homozygote. Should never see this." ) ;
}
2010-06-09 04:47:01 +08:00
} else {
2010-06-09 22:21:19 +08:00
violation = new MendelianViolation ( varContext , MendelianViolationType . NONE ) ;
2010-06-09 04:47:01 +08:00
}
2010-06-19 03:32:24 +08:00
2010-06-09 04:47:01 +08:00
} else {
2010-06-09 22:21:19 +08:00
violation = null ;
2010-06-09 04:47:01 +08:00
}
return violation ;
}
// Note, mendelian violation is guaranteed at this point; deNovo only happens at child het sites
private boolean isDeNovo ( VariantContext trio ) {
return trio . getGenotype ( trioStructure . child ) . isHet ( ) ;
}
// Note, mendelian violation is guaranteed at this point; technically we do not have to check this...but we are
private boolean isOppositeHomozygote ( VariantContext trio ) {
if ( trio . getGenotype ( trioStructure . child ) . isHet ( ) ) { // not valid at child het sites
return false ;
} else if ( trio . getHetCount ( ) > 1 ) { // child is not het, so if this is 2, mom and dad are both het, invalid
return false ;
} else if ( trio . getGenotype ( trioStructure . dad ) = = null | | trio . getGenotype ( trioStructure . mom ) = = null ) {
return false ;
}
return true ;
}
2010-06-09 22:21:19 +08:00
/ *
* * * * * * * * * * * * ASSESS DE NOVO AND OPPOSITE HOMOZYGOTES
* /
2010-06-09 04:47:01 +08:00
private MendelianViolation assessDeNovo ( VariantContext trio , RefMetaDataTracker tracker , ReferenceContext ref , AlignmentContext context ) {
2010-06-09 22:21:19 +08:00
MendelianViolation violation = new MendelianViolation ( trio , MendelianViolationType . DE_NOVO_SNP ) ;
// look for mis-genotyped sites by examining the proportion of the parent-allele bases in the child --
// as a spoeedup we do this only at non-filtered sites
if ( ! trio . isFiltered ( ) ) {
Allele parental = trio . getGenotype ( trioStructure . mom ) . getAllele ( 0 ) ; // guaranteed homozygous
2010-06-10 02:05:37 +08:00
if ( parental . getBases ( ) . length < 1 ) {
throw new StingException ( "Parental bases have length zero at " + trio . toString ( ) ) ;
}
2010-06-19 03:32:24 +08:00
Map < String , StratifiedAlignmentContext > splitContext = StratifiedAlignmentContext . splitContextBySample ( context . getBasePileup ( ) ) ;
Double proportion = getAlleleProportion ( parental , splitContext . get ( trioStructure . child ) ) ;
if ( proportion ! = null ) {
violation . addAttribute ( MendelianInfoKey . ProportionOfParentAllele . getKey ( ) , proportion ) ;
if ( ! deNovoRange . contains ( proportion ) ) {
2010-06-23 23:49:13 +08:00
//System.out.println("Filtering deNovo by proportion: is "+proportion+" should be in range "+deNovoRange.lower+"-"+deNovoRange.upper);
violation . filter ( ) ;
2010-06-19 03:32:24 +08:00
}
}
Pair < Allele , Integer > triAl = getTriAllelicQuality ( tracker , ref , trio , splitContext ) ;
if ( triAl ! = null ) {
violation . addAttribute ( MendelianInfoKey . TriAllelicBase . getKey ( ) , triAl . first . toString ( ) ) ;
violation . addAttribute ( MendelianInfoKey . TriAllelicQuality . getKey ( ) , triAl . second ) ;
if ( triAl . second > = deNovoTriQ ) {
2010-06-23 23:49:13 +08:00
violation . filter ( ) ;
2010-06-09 22:21:19 +08:00
}
}
2010-06-19 03:32:24 +08:00
} else {
2010-06-23 23:49:13 +08:00
violation . filter ( ) ;
2010-06-09 22:21:19 +08:00
}
2010-06-19 03:32:24 +08:00
2010-06-09 22:21:19 +08:00
return violation ;
2010-06-09 04:47:01 +08:00
}
private MendelianViolation assessOppositeHomozygote ( VariantContext trio , RefMetaDataTracker tracker , ReferenceContext ref , AlignmentContext context ) {
2010-06-09 22:21:19 +08:00
MendelianViolation violation = new MendelianViolation ( trio , MendelianViolationType . OPPOSITE_HOMOZYGOTE ) ;
logger . debug ( getParentalDeletion ( trio ) ) ;
violation . addAttribute ( MendelianInfoKey . ParentalDeletion . getKey ( ) , getParentalDeletion ( trio ) ) ;
// look for tri-allelic sites mis-called as hom -- as a speedup we do this only at non-filtered, non genotype error sites
2010-06-19 03:32:24 +08:00
if ( ! trio . isFiltered ( ) ) {
Map < String , StratifiedAlignmentContext > splitCon = StratifiedAlignmentContext . splitContextBySample ( context . getBasePileup ( ) ) ;
Pair < Allele , Integer > triAl = getTriAllelicQuality ( tracker , ref , trio , splitCon ) ;
if ( triAl ! = null ) {
violation . addAttribute ( MendelianInfoKey . TriAllelicBase . getKey ( ) , triAl . first . toString ( ) ) ;
violation . addAttribute ( MendelianInfoKey . TriAllelicQuality . getKey ( ) , triAl . second ) ;
if ( triAl . second > = opHomTriQ ) {
2010-06-23 23:49:13 +08:00
violation . filter ( ) ;
2010-06-09 22:21:19 +08:00
}
}
2010-06-19 03:32:24 +08:00
2010-06-23 23:49:13 +08:00
Double childProp = getAlleleProportion ( trio . getGenotype ( trioStructure . child ) . getAllele ( 0 ) , splitCon . get ( trioStructure . child ) ) ;
2010-06-19 03:32:24 +08:00
Double motherProp = getAlleleProportion ( trio . getGenotype ( trioStructure . mom ) . getAllele ( 0 ) , splitCon . get ( trioStructure . mom ) ) ;
2010-06-23 23:49:13 +08:00
Double fatherProp = getAlleleProportion ( trio . getGenotype ( trioStructure . dad ) . getAllele ( 0 ) , splitCon . get ( trioStructure . dad ) ) ;
2010-06-19 03:32:24 +08:00
if ( childProp ! = null ) {
violation . addAttribute ( MendelianInfoKey . ProportionOfParentAllele . getKey ( ) , childProp ) ;
2010-06-23 23:49:13 +08:00
if ( ! opHomRange . contains ( childProp ) ) {
violation . filter ( ) ;
2010-06-19 03:32:24 +08:00
}
}
2010-06-23 23:49:13 +08:00
if ( motherProp ! = null & & ! opHomRange . contains ( motherProp ) ) {
violation . filter ( ) ;
2010-06-09 22:21:19 +08:00
}
2010-06-19 03:32:24 +08:00
2010-06-23 23:49:13 +08:00
if ( fatherProp ! = null & & ! opHomRange . contains ( fatherProp ) ) {
violation . filter ( ) ;
2010-06-19 03:32:24 +08:00
}
} else {
2010-06-23 23:49:13 +08:00
violation . filter ( ) ;
2010-06-09 22:21:19 +08:00
}
return violation ;
}
2010-07-01 10:46:05 +08:00
private Double getAlleleProportion ( Allele a , StratifiedAlignmentContext context ) {
2010-06-19 03:32:24 +08:00
int numParental = 0 ;
int total = 0 ;
if ( context ! = null ) {
2010-07-01 10:46:05 +08:00
for ( PileupElement e : context . getContext ( StratifiedAlignmentContext . StratifiedContextType . COMPLETE ) . getBasePileup ( ) ) {
2010-06-19 03:32:24 +08:00
if ( e . getQual ( ) > = 10 & & e . getMappingQual ( ) > = 10 ) {
total + + ;
if ( e . getBase ( ) = = a . getBases ( ) [ 0 ] ) {
numParental + + ;
}
}
}
return ( ( double ) numParental ) / total ;
} else {
return null ;
}
}
private Pair < Allele , Integer > getTriAllelicQuality ( RefMetaDataTracker tracker , ReferenceContext ref , VariantContext var , Map < String , StratifiedAlignmentContext > strat ) {
int conf = 0 ;
Allele alt = null ;
for ( Map . Entry < String , StratifiedAlignmentContext > sEntry : strat . entrySet ( ) ) {
VariantCallContext call = engine . runGenotyper ( tracker , ref , sEntry . getValue ( ) . getContext ( StratifiedAlignmentContext . StratifiedContextType . COMPLETE ) ) ;
if ( call ! = null & & call . confidentlyCalled & & call . vc ! = null ) {
if ( call . vc . isSNP ( ) ) {
if ( ! call . vc . getAlternateAllele ( 0 ) . basesMatch ( var . getAlternateAllele ( 0 ) ) ) {
if ( alt = = null ) {
alt = call . vc . getAlternateAllele ( 0 ) ;
conf = ( int ) Math . floor ( 10 * call . vc . getNegLog10PError ( ) ) ;
} else {
conf + = ( int ) Math . floor ( 10 * call . vc . getNegLog10PError ( ) ) ;
}
}
}
}
}
if ( alt = = null ) {
return null ;
} else {
return new Pair < Allele , Integer > ( alt , conf ) ;
}
}
2010-06-09 22:21:19 +08:00
private String getParentalDeletion ( VariantContext trio ) {
// case 1, mom and dad are both hom, so missing is the one whose alleles don't match the child
if ( trio . getGenotype ( trioStructure . mom ) . isHom ( ) & & trio . getGenotype ( trioStructure . dad ) . isHom ( ) ) {
if ( trio . getGenotype ( trioStructure . mom ) . isHomRef ( ) = = trio . getGenotype ( trioStructure . child ) . isHomRef ( ) ) {
// mom and child both hom ref or hom var
return trioStructure . dad ;
} else if ( trio . getGenotype ( trioStructure . dad ) . isHomRef ( ) = = trio . getGenotype ( trioStructure . child ) . isHomRef ( ) ) {
return trioStructure . mom ;
} else {
// child matches neither parent
return "genotypeError" ;
}
} else { // case 2, either mom or dad is het - the hom must be the missing allele
return trio . getGenotype ( trioStructure . mom ) . isHet ( ) ? trioStructure . dad : trioStructure . mom ;
}
2010-06-09 04:47:01 +08:00
}
/ *
2010-06-09 22:21:19 +08:00
* * * * * * * * * * * * * * * REDUCE
2010-06-09 04:47:01 +08:00
* /
public VCFWriter reduce ( MendelianViolation variant , VCFWriter writer ) {
if ( variant ! = null ) {
2010-06-09 22:21:19 +08:00
trioStructure . updateHomozygosityRegions ( variant , bedOutput ) ;
2010-07-24 04:24:03 +08:00
writer . add ( variant . toVariantContext ( ) , variant . getRefBase ( ) ) ;
2010-06-09 04:47:01 +08:00
}
return writer ;
}
/ *
* * * * * * * * * * ON TRAVERSAL DONE
* /
public void onTraversalDone ( VCFWriter writer ) {
2010-06-09 22:21:19 +08:00
Map < String , HomozygosityRegion > regions = trioStructure . homozygousRegions ;
Map < String , Integer > counts = trioStructure . homozygousRegionCounts ;
2010-06-19 03:32:24 +08:00
List < HomozygosityRegion > to_print = new ArrayList < HomozygosityRegion > ( 3 ) ;
2010-06-09 22:21:19 +08:00
for ( Map . Entry < String , HomozygosityRegion > entryRegion : regions . entrySet ( ) ) {
if ( entryRegion . getValue ( ) ! = null ) {
logger . info ( "---------------- REGION NOT FINALIZED -----------------" ) ;
logger . info ( String . format ( "%s,%s,%s,%d,%d" , entryRegion . getKey ( ) , entryRegion . getValue ( ) . regionStart , entryRegion . getValue ( ) . lastSeen ,
entryRegion . getValue ( ) . deNovoSNPsInRegion , entryRegion . getValue ( ) . oppositeHomsInRegion ) ) ;
2010-06-19 03:32:24 +08:00
int chr_end = getToolkit ( ) . getSAMFileHeader ( ) . getSequenceDictionary ( ) . getSequence ( entryRegion . getValue ( ) . getContigStr ( ) ) . getSequenceLength ( ) ;
entryRegion . getValue ( ) . endedBy = GenomeLocParser . createGenomeLoc ( entryRegion . getValue ( ) . getContigStr ( ) , chr_end , chr_end ) ;
to_print . add ( entryRegion . getValue ( ) ) ;
2010-06-09 22:21:19 +08:00
}
2010-06-09 04:47:01 +08:00
}
2010-06-19 03:32:24 +08:00
Collections . sort ( to_print ) ;
for ( HomozygosityRegion hr : to_print ) {
bedOutput . printf ( "%s%n" , hr ) ;
}
2010-06-09 04:47:01 +08:00
}
2010-06-09 22:21:19 +08:00
/ *
* * * * * * * * * * * * * * * * * STATIC METHODS
* /
public static String getBedFileHeader ( ) {
return String . format ( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" , "Chrom" , "first_seen_hom" , "last_seen_hom" , "first_seen_het" ,
"sample_name" , "region_id" , "homozygous_region_size" , "size_to_first_het" , "calls_within_region" ,
"opposite_homozygotes_in_region" , "deNovo_SNPs_in_region" ) ;
}
2010-06-09 04:47:01 +08:00
}