git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3775 348d0f76-0448-11de-a6fe-93d51630548a

This commit is contained in:
ebanks 2010-07-13 00:56:12 +00:00
parent de969f7cc7
commit 1bef7dd170
4 changed files with 1 additions and 250 deletions

View File

@ -1,105 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
/**
* filter an indel callset based on given criteria
*/
public class IndelSubsets implements ConcordanceType {
public final static int HOMOPOLYMER_MAX = 20;
// Hint: to disable either of these features, set the value to zero (via command-line arguments);
// then empty files will be output for the "below threshold" set, effectively disabling it
private int homopolymerCutoff = 1;
private int sizeCutoff = 2;
private String[][][][] tags = new String[2][2][2][2];
private String sample1, sample2;
public IndelSubsets() {}
public void initialize(Map<String, String> args, Set<String> samples) {
if ( samples.size() != 2 )
throw new StingException("IndelSubsets concordance test cannot handle anything other than 2 VCF records");
if ( args.get("sizeCutoff") != null )
sizeCutoff = Integer.valueOf(args.get("sizeCutoff"));
if ( args.get("homopolymerCutoff") != null )
homopolymerCutoff = Integer.valueOf(args.get("homopolymerCutoff"));
Iterator<String> iter = samples.iterator();
sample1 = iter.next();
sample2 = iter.next();
for (int i = 0; i < 2; i++) {
String name1 = i == 0 ? sample1 : "no_" + sample1;
for (int j = 0; j < 2; j++) {
String name2 = j == 0 ? sample2 : "no_" + sample2;
for (int k = 0; k < 2; k++) {
String name3 = "size" + (k == 0 ? Integer.toString(sizeCutoff)+"AndUnder" : "GreaterThan"+Integer.toString(sizeCutoff));
for (int m = 0; m < 2; m++) {
String name4 = "homopolymer" + (m == 0 ? Integer.toString(homopolymerCutoff)+"AndUnder" : "GreaterThan"+Integer.toString(homopolymerCutoff));
tags[i][j][k][m] = name1 + "." + name2 + "." + name3 + "." + name4;
}
}
}
}
}
public String computeConcordance(Map<String, VCFGenotypeRecord> samplesToRecords, ReferenceContext ref) {
VCFGenotypeRecord indel1 = samplesToRecords.get(sample1);
VCFGenotypeRecord indel2 = samplesToRecords.get(sample2);
int set1 = ( indel1 != null && !indel1.isPointGenotype() ? 0 : 1 );
int set2 = ( indel2 != null && !indel2.isPointGenotype() ? 0 : 1 );
// skip this locus if they're both not valid indels
if ( set1 == 1 && set2 == 1 )
return null;
// only deal with a valid indel
VCFRecord indel = ( indel1 != null ? indel1.getRecord() : indel2.getRecord() );
// we only deal with the first allele
int size = ( indel.getAlternateAlleleList().get(0).length() <= sizeCutoff ? 0 : 1 );
int homopol = ( homopolymerRunSize(ref, indel) <= homopolymerCutoff ? 0 : 1 );
return tags[set1][set2][size][homopol];
}
private int homopolymerRunSize(ReferenceContext ref, VCFRecord indel) {
byte[] bases = ref.getBases();
GenomeLoc window = ref.getWindow();
GenomeLoc locus = ref.getLocus();
int refBasePos = (int)(locus.getStart() - window.getStart());
byte indelBase = indel.isDeletion() ? bases[refBasePos+1] : (byte)indel.getAlternateAlleleList().get(0).charAt(0);
int leftRun = 0;
for ( int i = refBasePos; i >= 0; i--) {
if ( bases[i] != indelBase )
break;
leftRun++;
}
indelBase = indel.isDeletion() ? bases[Math.min(refBasePos+indel.getAlternateAlleleList().get(0).length(),bases.length-1)] : (byte)indel.getAlternateAlleleList().get(0).charAt(indel.getAlternateAlleleList().get(0).length()-1);
int rightRun = 0;
for ( int i = refBasePos + (indel.isDeletion() ? 1+indel.getAlternateAlleleList().get(0).length() : 1); i < bases.length; i++) {
if ( bases[i] != indelBase )
break;
rightRun++;
}
//System.out.println(String.valueOf(bases) + ": " + leftRun + " / " + rightRun);
return Math.max(leftRun, rightRun);
}
public String getInfoName() { return "IndelSubsets"; }
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFHeaderLineType.String, "Indel-related subsets"); }
}

View File

@ -1,46 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broad.tribble.vcf.VCFGenotypeRecord;
import org.broad.tribble.vcf.VCFHeaderLineType;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import java.util.*;
import java.util.Map.Entry;
/**
* Split up N call sets into their various "Venn diagram" sets.
* Note that to minimize complexity (unlike SimpleVenn), this module does NOT check for discordance
* (i.e. the intersections contain calls that are present in multiple sets, regardless of whether
* they agree on the actual variant).
*/
public class NWayVenn implements ConcordanceType {
public NWayVenn() {}
public void initialize(Map<String, String> args, Set<String> samples) { }
public String computeConcordance(Map<String, VCFGenotypeRecord> samplesToRecords, ReferenceContext ref) {
if ( samplesToRecords.size() == 0 )
return null;
TreeSet<String> concordantSamples = new TreeSet<String>();
for ( Entry<String, VCFGenotypeRecord> entry : samplesToRecords.entrySet() ) {
if ( !entry.getValue().isNoCall() )
concordantSamples.add(entry.getKey());
}
StringBuffer tag = new StringBuffer();
Iterator<String> iter = concordantSamples.iterator();
while ( iter.hasNext() ) {
tag.append(iter.next());
if ( iter.hasNext() )
tag.append("-");
}
return tag.toString();
}
public String getInfoName() { return "NwayVenn"; }
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFHeaderLineType.String, "N-way Venn split"); }
}

View File

@ -1,66 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
/**
* Split up two call sets into their various "Venn diagram" sets
*/
public class SimpleVenn implements ConcordanceType {
private String sample1, sample2;
public SimpleVenn() {}
public void initialize(Map<String, String> args, Set<String> samples) {
if ( samples.size() != 2 )
throw new StingException("SimpleVenn concordance test cannot handle anything other than 2 VCF records");
Iterator<String> iter = samples.iterator();
sample1 = iter.next();
sample2 = iter.next();
}
public String computeConcordance(Map<String, VCFGenotypeRecord> samplesToRecords, ReferenceContext ref ) {
VCFGenotypeRecord call1 = samplesToRecords.get(sample1);
if ( call1 != null && call1.isNoCall() )
call1 = null;
VCFGenotypeRecord call2 = samplesToRecords.get(sample2);
if ( call2 != null && call2.isNoCall() )
call2 = null;
if ( call1 == null && call2 == null )
return null;
// set 1 only
if ( call2 == null )
return sample1 + "_only";
// set 2 only
else if ( call1 == null )
return sample2 + "_only";
// at this point we know that neither is null, so now we need to test for alternate allele concordance
VCFRecord callV1 = call1.getRecord();
VCFRecord callV2 = call2.getRecord();
// we can't really deal with multi-allelic variants
if ( callV1.isBiallelic() && callV2.isBiallelic() ) {
// intersection (concordant)
if ( callV1.getAlternativeBaseForSNP() == callV2.getAlternativeBaseForSNP() )
return "concordant";
// intersection (discordant)
else
return "discordant";
}
return "concordant";
}
public String getInfoName() { return "Venn"; }
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFHeaderLineType.String, "2-way Venn split"); }
}

View File

@ -10,43 +10,11 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
return "-T CallsetConcordance -R " + oneKGLocation + "reference/human_b36_both.fasta -L 1:1-8000 -CO %s";
}
@Test
public void testSimpleVenn() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF," + validationDataLocation + "NA12878.example1.vcf -B set2,VCF," + validationDataLocation + "NA12878.example2.vcf -CT SimpleVenn", 1,
Arrays.asList("a1970effe9c51923d52af9034e778de4"));
executeTest("testSimpleVenn", spec);
}
@Test
//@Test
public void testSNPConcordance() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF," + validationDataLocation + "NA12878.example1.vcf -B set2,VCF," + validationDataLocation + "NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
Arrays.asList("e7a0d52c266ba3c76283111674c7168f"));
executeTest("testSNPConcordance", spec);
}
@Test
public void testNWayVenn() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF," + validationDataLocation + "NA12878.example1.vcf -B set2,VCF," + validationDataLocation + "NA12878.example2.vcf -B set3,VCF," + validationDataLocation + "CEU.sample.vcf -CT NWayVenn", 1,
Arrays.asList("e65fc811137fca7d6c32125240c7468f"));
executeTest("testNWayVenn", spec);
}
@Test
public void testMulti() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF," + validationDataLocation + "NA12878.example1.vcf -B set2,VCF," + validationDataLocation + "NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5", 1,
Arrays.asList("ddc2507590e28743e9cb4b132cb066e7"));
executeTest("testMulti", spec);
}
@Test
public void testComplex() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF," + validationDataLocation + "complexExample.vcf -B set2,VCF," + validationDataLocation + "complexExample.vcf -CT NWayVenn", 1,
Arrays.asList("250df7bde7a8cf9c7ee7c5704183ea88"));
executeTest("testComplex", spec);
}
}