From 933133ee28c65e4bc24d61d84eaa982480a5bb5d Mon Sep 17 00:00:00 2001 From: chartl Date: Tue, 8 Jun 2010 03:56:07 +0000 Subject: [PATCH] Initial commit of the opposite homozygote classifier. Currently does the following, given a trio vcf: + Identifies opposite homozygote sites + Identifies the parent from whom it is expected that a null allele was inherited (or whether it was a putative genotype error; e.g. mom=homref, dad=homref, child=homvar) + Labels each opposite homozygote with its homozygous region in the child (e.g. region 1, region 2) + Labels each opposite homozygote with the size of the homozygous region in which it was found, the number of child homozygotes in the region, and the number of opposite homozygote violations within that region To come: + Classification of sites as likely tri-allelic Note that this is very experimental git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3498 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/OppositeHomozygoteClassifier.java | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/walkers/OppositeHomozygoteClassifier.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/OppositeHomozygoteClassifier.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/OppositeHomozygoteClassifier.java new file mode 100644 index 000000000..e92bf71f0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/OppositeHomozygoteClassifier.java @@ -0,0 +1,198 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Jun 7, 2010 + */ +public class OppositeHomozygoteClassifier extends RodWalker { + @Argument(shortName="f",fullName="familyPattern",required=true,doc="Pattern for the family structure (usage: mom+dad=child)") + String familyStr = null; + + private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); + private TrioStructure trioStructure; /* holds sample names of mom dad child */ + private ArrayList contextBuffer; /* holds contexts until they're ready to be printed */ + private GenomeLoc homozygousRegionStartChild; /* start of the homozygous region for child */ + private int callsWithinHomozygousRegion; /* number of calls in the current homozygous child region */ + private int childHomozygousRegionCounter; /* holds number of child homozygous regions */ + + public void initialize() { + trioStructure = parseTrioDescription(familyStr); + homozygousRegionStartChild = null; + childHomozygousRegionCounter = 0; + callsWithinHomozygousRegion = 0; + contextBuffer = new ArrayList(500); + } + + public static class TrioStructure { + public String mom, dad, child; + } + + public static TrioStructure parseTrioDescription(String family) { + Matcher m = FAMILY_PATTERN.matcher(family); + if (m.matches()) { + TrioStructure trio = new TrioStructure(); + //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); + trio.mom = m.group(1); + trio.dad = m.group(2); + trio.child = m.group(3); + return trio; + } else { + throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); + } + } + + public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) { + return null; + } + + VariantContext trioVariants = tracker.getVariantContext(ref,"trio", EnumSet.allOf(VariantContext.Type.class), ref.getLocus(), true); + // for this to work we need mismatching parents, one a homozyote, and a child homozygote + + if ( trioVariants == null ) { + return null; + } + + //System.out.println(": "+trioVariants.getGenotype(trioStructure.mom)+" dad: "+trioVariants.getGenotype(trioStructure.dad)+" child: "+trioVariants.getGenotype(trioStructure.child)); + if ( isOppositeHomozygoteSite(trioVariants) && ! trioVariants.isFiltered()) { + // find out who the homozygote is in the parents + if ( trioVariants.getGenotype(trioStructure.mom).isHom() ) { + return assessVariant(trioStructure.mom,trioStructure.dad,trioStructure.child,trioVariants,ref,context); + } else if ( trioVariants.getGenotype(trioStructure.dad).isHom() ) { + return assessVariant(trioStructure.dad,trioStructure.mom,trioStructure.child,trioVariants,ref,context); + } + } + + return trioVariants; + } + + private boolean isOppositeHomozygoteSite(VariantContext trio) { + if ( trio.getGenotype(trioStructure.child).isHet() ) { // not valid at child het sites + return false; + } else if ( trio.getHetCount() > 1 ) { // child is not het, so if this is 2, mom and dad are both het, invalid + return false; + } else if ( trio.getGenotype(trioStructure.dad) == null || trio.getGenotype(trioStructure.mom) == null ) { + return false; + } + + return true; + } + + private VariantContext assessVariant(String homParent, String otherParent, String child, VariantContext variCon, ReferenceContext refCon, AlignmentContext aliCon) { + // see if the child matches the hom parent + HashMap attributes = new HashMap(variCon.getAttributes()); + //System.out.println(refCon.getLocus()+" homParent: "+variCon.getGenotype(homParent).getGenotypeString()+" otherParent: "+variCon.getGenotype(otherParent).getGenotypeString()+" child: "+variCon.getGenotype(child).getGenotypeString()); + //do child and hom parent NOT match genotypes? + if ( variCon.getGenotype(child).isHomRef() && variCon.getGenotype(homParent).isHomVar() || + variCon.getGenotype(child).isHomVar() && variCon.getGenotype(homParent).isHomRef() ) { + // check for genotyping error (other must be het, or opposite of first parent) + if ( variCon.getGenotype(otherParent).isHet() || variCon.getGenotype(otherParent).isHomRef() != variCon.getGenotype(homParent).isHomRef() ) { + attributes.put("opHom",homParent); + } else { + attributes.put("opHom","genotypeError"); + } + } else if ( variCon.getGenotype(otherParent).isHom() && variCon.getGenotype(otherParent).isHomRef() != variCon.getGenotype(homParent).isHomRef() ) { + // is other parent both homozygous and different? + attributes.put("opHom",otherParent); + } + // todo -- assessment of site based on alignment contest (tri allelic? etc) + return new VariantContext(variCon.getName(), variCon.getLocation(), variCon.getAlleles(), variCon.getGenotypes(), variCon.getNegLog10PError(), variCon.getFilters(), attributes); + } + + public VCFWriter reduceInit() { + VCFWriter writer = new VCFWriter(out); + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("source", "OppositeHomozygoteClassifier")); + hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName())); + //hInfo.add(new VCFHeaderLine("opHom","Child tentatively inheritied the NULL ALLELE from this parent")); + // todo -- add info field annotation lines: "opHom", "CHR", "CHRS" + VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit())); + writer.writeHeader(vcfHeader); + + return writer; + } + + public VCFWriter reduce(VariantContext variCon, VCFWriter writer) { + if ( variCon == null ) { + return writer; + } + + if ( homozygosityRegionIsBroken(variCon) && contextBuffer.size() > 0 ) { + outputBufferedRecords(contextBuffer,variCon,writer); + } else if ( ! variCon.isFiltered() && ! variCon.getGenotype(trioStructure.child).isNoCall() ) { + callsWithinHomozygousRegion++; + } + + if ( variCon.hasAttribute("opHom") ) { + //writer.addRecord(VariantContextAdaptors.toVCF(variCon, variCon.getReference().getBases()[0])); + contextBuffer.add(variCon); + } + + if ( homozygousRegionStartChild == null ) { + if ( variCon.getGenotype(trioStructure.child).isHom() && ! variCon.isFiltered() ) { + homozygousRegionStartChild = variCon.getLocation(); + } + } + + return writer; + + } + + private boolean homozygosityRegionIsBroken(VariantContext context) { + // check to see if either the parent or child homozygosity regions have been broken + if ( homozygousRegionStartChild != null && context.getGenotype(trioStructure.child).isHet() && ! context.isFiltered() ) { + // NOTE: NO CALLS DO NOT BREAK REGIONS OF HOMOZYGOSITY + return true; + } + + return false; + } + + private void outputBufferedRecords(List bufCon, VariantContext varCon, VCFWriter writer) { + // the buffered contexts all share one feature -- come from the same child homozygosity region + String regionSize; + if ( varCon != null ) { + regionSize = Integer.toString(varCon.getLocation().distance(homozygousRegionStartChild)); + } else { + regionSize = "unknown"; + } + for ( VariantContext vc : bufCon ) { + HashMap attributes = new HashMap(vc.getAttributes()); + attributes.put("CHR",childHomozygousRegionCounter); + attributes.put("CHRS",regionSize); + attributes.put("CHRNCALL",callsWithinHomozygousRegion); + attributes.put("CHRNOPHOM",bufCon.size()); + VariantContext newVC = new VariantContext(vc.getName(),vc.getLocation(),vc.getAlleles(),vc.getGenotypes(),vc.getNegLog10PError(),vc.getFilters(),attributes); + writer.addRecord(VariantContextAdaptors.toVCF(newVC,vc.getReference().getBases()[0])); + } + childHomozygousRegionCounter++; + homozygousRegionStartChild = null; + callsWithinHomozygousRegion = 0; + bufCon.clear(); + } + + public void onTraversalDone(VCFWriter w) { + outputBufferedRecords(contextBuffer,null,w); + } +}