Now analyzes filtered SNP like all, novel subsets; support for selecting a single sample to analyze from a multi-sample VCF, support for trivial selection of records with INFO field key/value pair.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2613 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-01-16 20:22:04 +00:00
parent 8ae8e120f8
commit d0af7f6c7b
2 changed files with 79 additions and 20 deletions

View File

@ -29,7 +29,7 @@ public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnal
}
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
nSNPs += eval == null || eval.isReference()? 0 : 1;
nSNPs += eval == null || eval.isReference() ? 0 : 1;
// TODO -- break the het check out to a different module used only for single samples

View File

@ -7,10 +7,15 @@ import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import java.util.regex.Pattern;
import java.io.File;
import java.io.FileNotFoundException;
@ -60,6 +65,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
@Argument(fullName = "sampleName", shortName="sampleName", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
public String sampleName = null;
@Argument(fullName = "vcfInfoSelector", shortName="vcfInfoSelector", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
public String vcfInfoSelector = null;
String analysisFilenameBase = null;
@ -75,7 +82,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
// the types of analysis we support, and the string tags we associate with the enumerated value
enum ANALYSIS_TYPE {
// todo -- differeniate into three classes -- snps at known snp sites, snps not at known snp site but covered by known indel, and novel
ALL_SNPS("all"), SINGLETON_SNPS("singletons"), TWOHIT_SNPS("2plus_hit"), KNOWN_SNPS("known"), SNPS_AT_NON_SNP_SITES("snp_at_known_non_snps"), NOVEL_SNPS("novel");
ALL_SNPS("all"),
SINGLETON_SNPS("singletons"),
TWOHIT_SNPS("2plus_hit"),
KNOWN_SNPS("known"),
SNPS_AT_NON_SNP_SITES("snp_at_known_non_snps"),
NOVEL_SNPS("novel"),
FILTERED_SNPS("filtered");
private final String value;
ANALYSIS_TYPE(String value) { this.value = value;}
@ -84,16 +97,30 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
}
// final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
// ANALYSIS_TYPE.SINGLETON_SNPS,
// ANALYSIS_TYPE.TWOHIT_SNPS,
// ANALYSIS_TYPE.KNOWN_SNPS,
// ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
// ANALYSIS_TYPE.NOVEL_SNPS};
// final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
// ANALYSIS_TYPE.KNOWN_SNPS,
// ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
// ANALYSIS_TYPE.NOVEL_SNPS};
final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
ANALYSIS_TYPE.SINGLETON_SNPS,
ANALYSIS_TYPE.TWOHIT_SNPS,
ANALYSIS_TYPE.KNOWN_SNPS,
ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
ANALYSIS_TYPE.NOVEL_SNPS};
ANALYSIS_TYPE.NOVEL_SNPS,
ANALYSIS_TYPE.FILTERED_SNPS};
final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
ANALYSIS_TYPE.KNOWN_SNPS,
ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
ANALYSIS_TYPE.NOVEL_SNPS};
ANALYSIS_TYPE.NOVEL_SNPS,
ANALYSIS_TYPE.FILTERED_SNPS};
final ANALYSIS_TYPE[] SIMPLE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS};
ANALYSIS_TYPE[] ALL_ANALYSIS_NAMES = null;
@ -246,6 +273,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
// Iterate over each analysis, and update it
Variation eval = (Variation) tracker.lookup("eval", null);
Variation evalForFilter = null;
// ensure that the variation we're looking at is bi-allelic
if ( eval != null && ! eval.isBiallelic() )
@ -255,23 +283,34 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
if (eval.getNegLog10PError() * 10.0 < minConfidenceScore) eval = null;
if ( eval != null && (eval instanceof RodVCF) ) {
if ( ((RodVCF)eval).mCurrentRecord.isFiltered() && ! includeFilteredRecords ) {
//System.out.printf("Rejecting filtered record %s%n", eval);
eval = null; // we are not including filtered records, so set eval to null
}
// } else if ( sampleName != null ) {
// // code to grab a particular sample from a VCF
// Variation evalOld = eval;
// Genotype evalG = ((RodVCF)evalOld).getGenotype(sampleName);
// if ( evalG == null ) throw new StingException("Unexpected null genotype for sample " + sampleName);
// BasicGenotype g = new BasicGenotype(evalG.getLocation(), evalG.getBases(), ref.getBase(), evalG.getNegLog10PError());
// if ( g.isVariant(ref.getBase()) ) {
// eval = g.toVariation(ref.getBase());
if ( sampleName != null ) {
// code to grab a particular sample from a VCF
Variation evalOld = eval;
// System.out.printf("original is %s%n", evalOld);
eval = fakeVCFForSample((RodVCF)eval, ref, sampleName);
if ( eval != null && eval.isSNP() ) {
// System.out.printf("sample is %s%n", eval);
// System.out.printf("Replacing %s with %s%n", evalOld, eval);
// } else {
// eval = null;
// }
// }
// System.out.printf("%n");
} else {
eval = null;
}
}
if ( vcfInfoSelector != null && eval != null ) {
String[] keyValue = vcfInfoSelector.split("=");
Map<String, String> map = ((RodVCF)eval).getRecord().getInfoValues();
if ( map.containsKey(keyValue[0]) && ! Pattern.matches(keyValue[1], map.get(keyValue[0]) ) )
eval = null;
}
if ( eval != null && ((RodVCF)eval).mCurrentRecord.isFiltered() ) {
evalForFilter = eval;
if ( ! includeFilteredRecords ) {
eval = null; // we are not including filtered records, so set eval to null
}
//System.out.printf("Rejecting filtered record %s%n", eval);
}
}
// update stats about all of the SNPs
@ -283,6 +322,11 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
updateAnalysisSet(noveltySet, eval, tracker, ref.getBase(), context);
}
if (evalForFilter != null) {
updateAnalysisSet(ANALYSIS_TYPE.FILTERED_SNPS, evalForFilter, tracker, ref.getBase(), context);
}
// are we a population backed call? then update
if (eval instanceof SNPCallFromGenotypes) {
SNPCallFromGenotypes call = (SNPCallFromGenotypes) eval;
@ -296,6 +340,21 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
return 1;
}
private RodVCF fakeVCFForSample(RodVCF eval, ReferenceContext ref, final String sampleName) {
VCFGenotypeRecord genotype = (VCFGenotypeRecord)eval.getGenotype(sampleName);
if ( genotype.getNegLog10PError() > 0 ) {
VCFRecord record = new VCFRecord(ref.getBase(), ref.getLocus(), "GT");
record.setAlternateBases(eval.getRecord().getAlternateAlleles());
record.addGenotypeRecord(genotype);
record.setQual(10*genotype.getNegLog10PError());
record.setFilterString(eval.getFilterString());
record.addInfoFields(eval.getInfoValues());
return new RodVCF("fakeVCFForSample", record, eval.getReader());
} else {
return null;
}
}
private ANALYSIS_TYPE getNovelAnalysisType(RefMetaDataTracker tracker) {
RODRecordList<ReferenceOrderedDatum> dbsnpList = tracker.getTrackData("dbsnp", null);