Private feature to input a list of family descriptions from a file and to look for MV's on all of these. Feature can also output a detailed description of the violation into a separate file

This commit is contained in:
Guillermo del Angel 2011-07-11 14:17:59 -04:00
parent 4d565b0811
commit d587856f2d
1 changed files with 51 additions and 7 deletions

View File

@ -48,6 +48,7 @@ import org.apache.log4j.Logger;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.lang.annotation.AnnotationFormatError; import java.lang.annotation.AnnotationFormatError;
import java.util.*; import java.util.*;
@ -100,6 +101,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false) @Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false)
private File AF_FILE = new File(""); private File AF_FILE = new File("");
@Hidden
@Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private File FAMILY_STRUCTURE_FILE = null;
@Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) @Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String FAMILY_STRUCTURE = ""; private String FAMILY_STRUCTURE = "";
@ -121,6 +126,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false) @Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false)
private boolean SELECT_INDELS = false; private boolean SELECT_INDELS = false;
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String outMVFile = null;
/* Private class used to store the intermediate variants in the integer random selection process */ /* Private class used to store the intermediate variants in the integer random selection process */
private class RandomVariantStructure { private class RandomVariantStructure {
@ -148,7 +156,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private boolean DISCORDANCE_ONLY = false; private boolean DISCORDANCE_ONLY = false;
private boolean CONCORDANCE_ONLY = false; private boolean CONCORDANCE_ONLY = false;
private MendelianViolation mv; private Set<MendelianViolation> mvSet = new HashSet<MendelianViolation>();
/* default name for the variant dataset (VCF) */ /* default name for the variant dataset (VCF) */
private final String variantRodName = "variant"; private final String variantRodName = "variant";
@ -169,6 +177,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
double bkDelta = 0.0; double bkDelta = 0.0;
private PrintStream outMVFileStream = null;
/** /**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
@ -224,10 +234,29 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
CONCORDANCE_ONLY = concordanceRodName.length() > 0; CONCORDANCE_ONLY = concordanceRodName.length() > 0;
if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceRodName); if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceRodName);
if (MENDELIAN_VIOLATIONS) if (MENDELIAN_VIOLATIONS) {
mv = new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD); if ( FAMILY_STRUCTURE_FILE != null) {
try {
for ( final String line : new XReadLines( FAMILY_STRUCTURE_FILE ) ) {
MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom()))
mvSet.add(mv);
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(AF_FILE, e);
}
if (outMVFile != null)
try {
outMVFileStream = new PrintStream(outMVFile);
}
catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); }
}
else
mvSet.add(new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD));
}
else if (!FAMILY_STRUCTURE.isEmpty()) { else if (!FAMILY_STRUCTURE.isEmpty()) {
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD); mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
MENDELIAN_VIOLATIONS = true; MENDELIAN_VIOLATIONS = true;
} }
@ -289,9 +318,24 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
for (VariantContext vc : vcs) { for (VariantContext vc : vcs) {
if (MENDELIAN_VIOLATIONS) { if (MENDELIAN_VIOLATIONS) {
if (!mv.isViolation(vc)) { boolean foundMV = false;
break; for (MendelianViolation mv : mvSet) {
if (mv.isViolation(vc)) {
foundMV = true;
//System.out.println(vc.toString());
if (outMVFile != null)
outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " +
"childG=%s childGL=%s\n",vc.getChr(), vc.getStart(),
vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getChromosomeCount(vc.getAlternateAllele(0)),
mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(),
vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
vc.getGenotype(mv.getSampleChild()).toBriefString(),vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString() );
}
} }
if (!foundMV)
break;
} }
if (DISCORDANCE_ONLY) { if (DISCORDANCE_ONLY) {
Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false); Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false);
@ -329,7 +373,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) { if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) {
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName. // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
// We then pick a variant with probablity AF*desiredFraction // We then pick a variant with probablity AF*desiredFraction
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY); String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
double af; double af;