Now analyzes filtered SNP like all, novel subsets; support for selecting a single sample to analyze from a multi-sample VCF, support for trivial selection of records with INFO field key/value pair.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2613 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8ae8e120f8
commit
d0af7f6c7b
|
|
@ -29,7 +29,7 @@ public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnal
|
||||||
}
|
}
|
||||||
|
|
||||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||||
nSNPs += eval == null || eval.isReference()? 0 : 1;
|
nSNPs += eval == null || eval.isReference() ? 0 : 1;
|
||||||
|
|
||||||
// TODO -- break the het check out to a different module used only for single samples
|
// TODO -- break the het check out to a different module used only for single samples
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,15 @@ import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||||
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
|
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
@ -60,6 +65,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
@Argument(fullName = "sampleName", shortName="sampleName", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
|
@Argument(fullName = "sampleName", shortName="sampleName", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
|
||||||
public String sampleName = null;
|
public String sampleName = null;
|
||||||
|
@Argument(fullName = "vcfInfoSelector", shortName="vcfInfoSelector", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
|
||||||
|
public String vcfInfoSelector = null;
|
||||||
|
|
||||||
String analysisFilenameBase = null;
|
String analysisFilenameBase = null;
|
||||||
|
|
||||||
|
|
@ -75,7 +82,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||||
// the types of analysis we support, and the string tags we associate with the enumerated value
|
// the types of analysis we support, and the string tags we associate with the enumerated value
|
||||||
enum ANALYSIS_TYPE {
|
enum ANALYSIS_TYPE {
|
||||||
// todo -- differeniate into three classes -- snps at known snp sites, snps not at known snp site but covered by known indel, and novel
|
// todo -- differeniate into three classes -- snps at known snp sites, snps not at known snp site but covered by known indel, and novel
|
||||||
ALL_SNPS("all"), SINGLETON_SNPS("singletons"), TWOHIT_SNPS("2plus_hit"), KNOWN_SNPS("known"), SNPS_AT_NON_SNP_SITES("snp_at_known_non_snps"), NOVEL_SNPS("novel");
|
ALL_SNPS("all"),
|
||||||
|
SINGLETON_SNPS("singletons"),
|
||||||
|
TWOHIT_SNPS("2plus_hit"),
|
||||||
|
KNOWN_SNPS("known"),
|
||||||
|
SNPS_AT_NON_SNP_SITES("snp_at_known_non_snps"),
|
||||||
|
NOVEL_SNPS("novel"),
|
||||||
|
FILTERED_SNPS("filtered");
|
||||||
|
|
||||||
private final String value;
|
private final String value;
|
||||||
ANALYSIS_TYPE(String value) { this.value = value;}
|
ANALYSIS_TYPE(String value) { this.value = value;}
|
||||||
|
|
@ -84,16 +97,30 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
||||||
|
// ANALYSIS_TYPE.SINGLETON_SNPS,
|
||||||
|
// ANALYSIS_TYPE.TWOHIT_SNPS,
|
||||||
|
// ANALYSIS_TYPE.KNOWN_SNPS,
|
||||||
|
// ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
||||||
|
// ANALYSIS_TYPE.NOVEL_SNPS};
|
||||||
|
// final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
||||||
|
// ANALYSIS_TYPE.KNOWN_SNPS,
|
||||||
|
// ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
||||||
|
// ANALYSIS_TYPE.NOVEL_SNPS};
|
||||||
|
|
||||||
final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
||||||
ANALYSIS_TYPE.SINGLETON_SNPS,
|
ANALYSIS_TYPE.SINGLETON_SNPS,
|
||||||
ANALYSIS_TYPE.TWOHIT_SNPS,
|
ANALYSIS_TYPE.TWOHIT_SNPS,
|
||||||
ANALYSIS_TYPE.KNOWN_SNPS,
|
ANALYSIS_TYPE.KNOWN_SNPS,
|
||||||
ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
||||||
ANALYSIS_TYPE.NOVEL_SNPS};
|
ANALYSIS_TYPE.NOVEL_SNPS,
|
||||||
|
ANALYSIS_TYPE.FILTERED_SNPS};
|
||||||
final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
||||||
ANALYSIS_TYPE.KNOWN_SNPS,
|
ANALYSIS_TYPE.KNOWN_SNPS,
|
||||||
ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
||||||
ANALYSIS_TYPE.NOVEL_SNPS};
|
ANALYSIS_TYPE.NOVEL_SNPS,
|
||||||
|
ANALYSIS_TYPE.FILTERED_SNPS};
|
||||||
|
|
||||||
final ANALYSIS_TYPE[] SIMPLE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS};
|
final ANALYSIS_TYPE[] SIMPLE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS};
|
||||||
ANALYSIS_TYPE[] ALL_ANALYSIS_NAMES = null;
|
ANALYSIS_TYPE[] ALL_ANALYSIS_NAMES = null;
|
||||||
|
|
||||||
|
|
@ -246,6 +273,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
// Iterate over each analysis, and update it
|
// Iterate over each analysis, and update it
|
||||||
Variation eval = (Variation) tracker.lookup("eval", null);
|
Variation eval = (Variation) tracker.lookup("eval", null);
|
||||||
|
Variation evalForFilter = null;
|
||||||
|
|
||||||
// ensure that the variation we're looking at is bi-allelic
|
// ensure that the variation we're looking at is bi-allelic
|
||||||
if ( eval != null && ! eval.isBiallelic() )
|
if ( eval != null && ! eval.isBiallelic() )
|
||||||
|
|
@ -255,23 +283,34 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||||
if (eval.getNegLog10PError() * 10.0 < minConfidenceScore) eval = null;
|
if (eval.getNegLog10PError() * 10.0 < minConfidenceScore) eval = null;
|
||||||
|
|
||||||
if ( eval != null && (eval instanceof RodVCF) ) {
|
if ( eval != null && (eval instanceof RodVCF) ) {
|
||||||
if ( ((RodVCF)eval).mCurrentRecord.isFiltered() && ! includeFilteredRecords ) {
|
if ( sampleName != null ) {
|
||||||
//System.out.printf("Rejecting filtered record %s%n", eval);
|
// code to grab a particular sample from a VCF
|
||||||
eval = null; // we are not including filtered records, so set eval to null
|
Variation evalOld = eval;
|
||||||
}
|
// System.out.printf("original is %s%n", evalOld);
|
||||||
// } else if ( sampleName != null ) {
|
eval = fakeVCFForSample((RodVCF)eval, ref, sampleName);
|
||||||
// // code to grab a particular sample from a VCF
|
if ( eval != null && eval.isSNP() ) {
|
||||||
// Variation evalOld = eval;
|
// System.out.printf("sample is %s%n", eval);
|
||||||
// Genotype evalG = ((RodVCF)evalOld).getGenotype(sampleName);
|
|
||||||
// if ( evalG == null ) throw new StingException("Unexpected null genotype for sample " + sampleName);
|
|
||||||
// BasicGenotype g = new BasicGenotype(evalG.getLocation(), evalG.getBases(), ref.getBase(), evalG.getNegLog10PError());
|
|
||||||
// if ( g.isVariant(ref.getBase()) ) {
|
|
||||||
// eval = g.toVariation(ref.getBase());
|
|
||||||
// System.out.printf("Replacing %s with %s%n", evalOld, eval);
|
// System.out.printf("Replacing %s with %s%n", evalOld, eval);
|
||||||
// } else {
|
// System.out.printf("%n");
|
||||||
// eval = null;
|
} else {
|
||||||
// }
|
eval = null;
|
||||||
// }
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( vcfInfoSelector != null && eval != null ) {
|
||||||
|
String[] keyValue = vcfInfoSelector.split("=");
|
||||||
|
Map<String, String> map = ((RodVCF)eval).getRecord().getInfoValues();
|
||||||
|
if ( map.containsKey(keyValue[0]) && ! Pattern.matches(keyValue[1], map.get(keyValue[0]) ) )
|
||||||
|
eval = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( eval != null && ((RodVCF)eval).mCurrentRecord.isFiltered() ) {
|
||||||
|
evalForFilter = eval;
|
||||||
|
if ( ! includeFilteredRecords ) {
|
||||||
|
eval = null; // we are not including filtered records, so set eval to null
|
||||||
|
}
|
||||||
|
//System.out.printf("Rejecting filtered record %s%n", eval);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// update stats about all of the SNPs
|
// update stats about all of the SNPs
|
||||||
|
|
@ -283,6 +322,11 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||||
updateAnalysisSet(noveltySet, eval, tracker, ref.getBase(), context);
|
updateAnalysisSet(noveltySet, eval, tracker, ref.getBase(), context);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (evalForFilter != null) {
|
||||||
|
updateAnalysisSet(ANALYSIS_TYPE.FILTERED_SNPS, evalForFilter, tracker, ref.getBase(), context);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// are we a population backed call? then update
|
// are we a population backed call? then update
|
||||||
if (eval instanceof SNPCallFromGenotypes) {
|
if (eval instanceof SNPCallFromGenotypes) {
|
||||||
SNPCallFromGenotypes call = (SNPCallFromGenotypes) eval;
|
SNPCallFromGenotypes call = (SNPCallFromGenotypes) eval;
|
||||||
|
|
@ -296,6 +340,21 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private RodVCF fakeVCFForSample(RodVCF eval, ReferenceContext ref, final String sampleName) {
|
||||||
|
VCFGenotypeRecord genotype = (VCFGenotypeRecord)eval.getGenotype(sampleName);
|
||||||
|
if ( genotype.getNegLog10PError() > 0 ) {
|
||||||
|
VCFRecord record = new VCFRecord(ref.getBase(), ref.getLocus(), "GT");
|
||||||
|
record.setAlternateBases(eval.getRecord().getAlternateAlleles());
|
||||||
|
record.addGenotypeRecord(genotype);
|
||||||
|
record.setQual(10*genotype.getNegLog10PError());
|
||||||
|
record.setFilterString(eval.getFilterString());
|
||||||
|
record.addInfoFields(eval.getInfoValues());
|
||||||
|
return new RodVCF("fakeVCFForSample", record, eval.getReader());
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private ANALYSIS_TYPE getNovelAnalysisType(RefMetaDataTracker tracker) {
|
private ANALYSIS_TYPE getNovelAnalysisType(RefMetaDataTracker tracker) {
|
||||||
RODRecordList<ReferenceOrderedDatum> dbsnpList = tracker.getTrackData("dbsnp", null);
|
RODRecordList<ReferenceOrderedDatum> dbsnpList = tracker.getTrackData("dbsnp", null);
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue