Initial commit of the opposite homozygote classifier. Currently does the following, given a trio vcf:

+ Identifies opposite homozygote sites
 + Identifies the parent from whom it is expected that a null allele was inherited (or whether it was a putative genotype error; e.g. mom=homref, dad=homref, child=homvar)
 + Labels each opposite homozygote with its homozygous region in the child (e.g. region 1, region 2)
 + Labels each opposite homozygote with the size of the homozygous region in which it was found, the number of child homozygotes in the region, and the number of opposite homozygote violations within that region

To come:
 + Classification of sites as likely tri-allelic


Note that this is very experimental



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3498 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-06-08 03:56:07 +00:00
parent 199e4208cd
commit 933133ee28
1 changed files with 198 additions and 0 deletions

View File

@ -0,0 +1,198 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
*
* @Author chartl
* @Date Jun 7, 2010
*/
public class OppositeHomozygoteClassifier extends RodWalker<VariantContext,VCFWriter> {
@Argument(shortName="f",fullName="familyPattern",required=true,doc="Pattern for the family structure (usage: mom+dad=child)")
String familyStr = null;
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
private TrioStructure trioStructure; /* holds sample names of mom dad child */
private ArrayList<VariantContext> contextBuffer; /* holds contexts until they're ready to be printed */
private GenomeLoc homozygousRegionStartChild; /* start of the homozygous region for child */
private int callsWithinHomozygousRegion; /* number of calls in the current homozygous child region */
private int childHomozygousRegionCounter; /* holds number of child homozygous regions */
public void initialize() {
trioStructure = parseTrioDescription(familyStr);
homozygousRegionStartChild = null;
childHomozygousRegionCounter = 0;
callsWithinHomozygousRegion = 0;
contextBuffer = new ArrayList<VariantContext>(500);
}
public static class TrioStructure {
public String mom, dad, child;
}
public static TrioStructure parseTrioDescription(String family) {
Matcher m = FAMILY_PATTERN.matcher(family);
if (m.matches()) {
TrioStructure trio = new TrioStructure();
//System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE);
trio.mom = m.group(1);
trio.dad = m.group(2);
trio.child = m.group(3);
return trio;
} else {
throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child");
}
}
public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) {
return null;
}
VariantContext trioVariants = tracker.getVariantContext(ref,"trio", EnumSet.allOf(VariantContext.Type.class), ref.getLocus(), true);
// for this to work we need mismatching parents, one a homozyote, and a child homozygote
if ( trioVariants == null ) {
return null;
}
//System.out.println(": "+trioVariants.getGenotype(trioStructure.mom)+" dad: "+trioVariants.getGenotype(trioStructure.dad)+" child: "+trioVariants.getGenotype(trioStructure.child));
if ( isOppositeHomozygoteSite(trioVariants) && ! trioVariants.isFiltered()) {
// find out who the homozygote is in the parents
if ( trioVariants.getGenotype(trioStructure.mom).isHom() ) {
return assessVariant(trioStructure.mom,trioStructure.dad,trioStructure.child,trioVariants,ref,context);
} else if ( trioVariants.getGenotype(trioStructure.dad).isHom() ) {
return assessVariant(trioStructure.dad,trioStructure.mom,trioStructure.child,trioVariants,ref,context);
}
}
return trioVariants;
}
private boolean isOppositeHomozygoteSite(VariantContext trio) {
if ( trio.getGenotype(trioStructure.child).isHet() ) { // not valid at child het sites
return false;
} else if ( trio.getHetCount() > 1 ) { // child is not het, so if this is 2, mom and dad are both het, invalid
return false;
} else if ( trio.getGenotype(trioStructure.dad) == null || trio.getGenotype(trioStructure.mom) == null ) {
return false;
}
return true;
}
private VariantContext assessVariant(String homParent, String otherParent, String child, VariantContext variCon, ReferenceContext refCon, AlignmentContext aliCon) {
// see if the child matches the hom parent
HashMap<String,Object> attributes = new HashMap<String,Object>(variCon.getAttributes());
//System.out.println(refCon.getLocus()+" homParent: "+variCon.getGenotype(homParent).getGenotypeString()+" otherParent: "+variCon.getGenotype(otherParent).getGenotypeString()+" child: "+variCon.getGenotype(child).getGenotypeString());
//do child and hom parent NOT match genotypes?
if ( variCon.getGenotype(child).isHomRef() && variCon.getGenotype(homParent).isHomVar() ||
variCon.getGenotype(child).isHomVar() && variCon.getGenotype(homParent).isHomRef() ) {
// check for genotyping error (other must be het, or opposite of first parent)
if ( variCon.getGenotype(otherParent).isHet() || variCon.getGenotype(otherParent).isHomRef() != variCon.getGenotype(homParent).isHomRef() ) {
attributes.put("opHom",homParent);
} else {
attributes.put("opHom","genotypeError");
}
} else if ( variCon.getGenotype(otherParent).isHom() && variCon.getGenotype(otherParent).isHomRef() != variCon.getGenotype(homParent).isHomRef() ) {
// is other parent both homozygous and different?
attributes.put("opHom",otherParent);
}
// todo -- assessment of site based on alignment contest (tri allelic? etc)
return new VariantContext(variCon.getName(), variCon.getLocation(), variCon.getAlleles(), variCon.getGenotypes(), variCon.getNegLog10PError(), variCon.getFilters(), attributes);
}
public VCFWriter reduceInit() {
VCFWriter writer = new VCFWriter(out);
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "OppositeHomozygoteClassifier"));
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
//hInfo.add(new VCFHeaderLine("opHom","Child tentatively inheritied the NULL ALLELE from this parent"));
// todo -- add info field annotation lines: "opHom", "CHR", "CHRS"
VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit()));
writer.writeHeader(vcfHeader);
return writer;
}
public VCFWriter reduce(VariantContext variCon, VCFWriter writer) {
if ( variCon == null ) {
return writer;
}
if ( homozygosityRegionIsBroken(variCon) && contextBuffer.size() > 0 ) {
outputBufferedRecords(contextBuffer,variCon,writer);
} else if ( ! variCon.isFiltered() && ! variCon.getGenotype(trioStructure.child).isNoCall() ) {
callsWithinHomozygousRegion++;
}
if ( variCon.hasAttribute("opHom") ) {
//writer.addRecord(VariantContextAdaptors.toVCF(variCon, variCon.getReference().getBases()[0]));
contextBuffer.add(variCon);
}
if ( homozygousRegionStartChild == null ) {
if ( variCon.getGenotype(trioStructure.child).isHom() && ! variCon.isFiltered() ) {
homozygousRegionStartChild = variCon.getLocation();
}
}
return writer;
}
private boolean homozygosityRegionIsBroken(VariantContext context) {
// check to see if either the parent or child homozygosity regions have been broken
if ( homozygousRegionStartChild != null && context.getGenotype(trioStructure.child).isHet() && ! context.isFiltered() ) {
// NOTE: NO CALLS DO NOT BREAK REGIONS OF HOMOZYGOSITY
return true;
}
return false;
}
private void outputBufferedRecords(List<VariantContext> bufCon, VariantContext varCon, VCFWriter writer) {
// the buffered contexts all share one feature -- come from the same child homozygosity region
String regionSize;
if ( varCon != null ) {
regionSize = Integer.toString(varCon.getLocation().distance(homozygousRegionStartChild));
} else {
regionSize = "unknown";
}
for ( VariantContext vc : bufCon ) {
HashMap<String,Object> attributes = new HashMap<String,Object>(vc.getAttributes());
attributes.put("CHR",childHomozygousRegionCounter);
attributes.put("CHRS",regionSize);
attributes.put("CHRNCALL",callsWithinHomozygousRegion);
attributes.put("CHRNOPHOM",bufCon.size());
VariantContext newVC = new VariantContext(vc.getName(),vc.getLocation(),vc.getAlleles(),vc.getGenotypes(),vc.getNegLog10PError(),vc.getFilters(),attributes);
writer.addRecord(VariantContextAdaptors.toVCF(newVC,vc.getReference().getBases()[0]));
}
childHomozygousRegionCounter++;
homozygousRegionStartChild = null;
callsWithinHomozygousRegion = 0;
bufCon.clear();
}
public void onTraversalDone(VCFWriter w) {
outputBufferedRecords(contextBuffer,null,w);
}
}