git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3775 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
de969f7cc7
commit
1bef7dd170
|
|
@ -1,105 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.concordance;
|
||||
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* filter an indel callset based on given criteria
|
||||
*/
|
||||
public class IndelSubsets implements ConcordanceType {
|
||||
|
||||
public final static int HOMOPOLYMER_MAX = 20;
|
||||
|
||||
// Hint: to disable either of these features, set the value to zero (via command-line arguments);
|
||||
// then empty files will be output for the "below threshold" set, effectively disabling it
|
||||
private int homopolymerCutoff = 1;
|
||||
private int sizeCutoff = 2;
|
||||
|
||||
private String[][][][] tags = new String[2][2][2][2];
|
||||
private String sample1, sample2;
|
||||
|
||||
public IndelSubsets() {}
|
||||
|
||||
public void initialize(Map<String, String> args, Set<String> samples) {
|
||||
if ( samples.size() != 2 )
|
||||
throw new StingException("IndelSubsets concordance test cannot handle anything other than 2 VCF records");
|
||||
|
||||
if ( args.get("sizeCutoff") != null )
|
||||
sizeCutoff = Integer.valueOf(args.get("sizeCutoff"));
|
||||
if ( args.get("homopolymerCutoff") != null )
|
||||
homopolymerCutoff = Integer.valueOf(args.get("homopolymerCutoff"));
|
||||
|
||||
Iterator<String> iter = samples.iterator();
|
||||
sample1 = iter.next();
|
||||
sample2 = iter.next();
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
String name1 = i == 0 ? sample1 : "no_" + sample1;
|
||||
for (int j = 0; j < 2; j++) {
|
||||
String name2 = j == 0 ? sample2 : "no_" + sample2;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
String name3 = "size" + (k == 0 ? Integer.toString(sizeCutoff)+"AndUnder" : "GreaterThan"+Integer.toString(sizeCutoff));
|
||||
for (int m = 0; m < 2; m++) {
|
||||
String name4 = "homopolymer" + (m == 0 ? Integer.toString(homopolymerCutoff)+"AndUnder" : "GreaterThan"+Integer.toString(homopolymerCutoff));
|
||||
tags[i][j][k][m] = name1 + "." + name2 + "." + name3 + "." + name4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String computeConcordance(Map<String, VCFGenotypeRecord> samplesToRecords, ReferenceContext ref) {
|
||||
|
||||
VCFGenotypeRecord indel1 = samplesToRecords.get(sample1);
|
||||
VCFGenotypeRecord indel2 = samplesToRecords.get(sample2);
|
||||
|
||||
int set1 = ( indel1 != null && !indel1.isPointGenotype() ? 0 : 1 );
|
||||
int set2 = ( indel2 != null && !indel2.isPointGenotype() ? 0 : 1 );
|
||||
|
||||
// skip this locus if they're both not valid indels
|
||||
if ( set1 == 1 && set2 == 1 )
|
||||
return null;
|
||||
|
||||
// only deal with a valid indel
|
||||
VCFRecord indel = ( indel1 != null ? indel1.getRecord() : indel2.getRecord() );
|
||||
|
||||
// we only deal with the first allele
|
||||
int size = ( indel.getAlternateAlleleList().get(0).length() <= sizeCutoff ? 0 : 1 );
|
||||
int homopol = ( homopolymerRunSize(ref, indel) <= homopolymerCutoff ? 0 : 1 );
|
||||
|
||||
return tags[set1][set2][size][homopol];
|
||||
}
|
||||
|
||||
private int homopolymerRunSize(ReferenceContext ref, VCFRecord indel) {
|
||||
byte[] bases = ref.getBases();
|
||||
GenomeLoc window = ref.getWindow();
|
||||
GenomeLoc locus = ref.getLocus();
|
||||
|
||||
int refBasePos = (int)(locus.getStart() - window.getStart());
|
||||
byte indelBase = indel.isDeletion() ? bases[refBasePos+1] : (byte)indel.getAlternateAlleleList().get(0).charAt(0);
|
||||
int leftRun = 0;
|
||||
for ( int i = refBasePos; i >= 0; i--) {
|
||||
if ( bases[i] != indelBase )
|
||||
break;
|
||||
leftRun++;
|
||||
}
|
||||
|
||||
indelBase = indel.isDeletion() ? bases[Math.min(refBasePos+indel.getAlternateAlleleList().get(0).length(),bases.length-1)] : (byte)indel.getAlternateAlleleList().get(0).charAt(indel.getAlternateAlleleList().get(0).length()-1);
|
||||
int rightRun = 0;
|
||||
for ( int i = refBasePos + (indel.isDeletion() ? 1+indel.getAlternateAlleleList().get(0).length() : 1); i < bases.length; i++) {
|
||||
if ( bases[i] != indelBase )
|
||||
break;
|
||||
rightRun++;
|
||||
}
|
||||
|
||||
//System.out.println(String.valueOf(bases) + ": " + leftRun + " / " + rightRun);
|
||||
return Math.max(leftRun, rightRun);
|
||||
}
|
||||
|
||||
public String getInfoName() { return "IndelSubsets"; }
|
||||
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFHeaderLineType.String, "Indel-related subsets"); }
|
||||
}
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.concordance;
|
||||
|
||||
import org.broad.tribble.vcf.VCFGenotypeRecord;
|
||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
/**
|
||||
* Split up N call sets into their various "Venn diagram" sets.
|
||||
* Note that to minimize complexity (unlike SimpleVenn), this module does NOT check for discordance
|
||||
* (i.e. the intersections contain calls that are present in multiple sets, regardless of whether
|
||||
* they agree on the actual variant).
|
||||
*/
|
||||
public class NWayVenn implements ConcordanceType {
|
||||
|
||||
public NWayVenn() {}
|
||||
|
||||
public void initialize(Map<String, String> args, Set<String> samples) { }
|
||||
|
||||
public String computeConcordance(Map<String, VCFGenotypeRecord> samplesToRecords, ReferenceContext ref) {
|
||||
if ( samplesToRecords.size() == 0 )
|
||||
return null;
|
||||
|
||||
TreeSet<String> concordantSamples = new TreeSet<String>();
|
||||
for ( Entry<String, VCFGenotypeRecord> entry : samplesToRecords.entrySet() ) {
|
||||
if ( !entry.getValue().isNoCall() )
|
||||
concordantSamples.add(entry.getKey());
|
||||
}
|
||||
|
||||
StringBuffer tag = new StringBuffer();
|
||||
Iterator<String> iter = concordantSamples.iterator();
|
||||
while ( iter.hasNext() ) {
|
||||
tag.append(iter.next());
|
||||
if ( iter.hasNext() )
|
||||
tag.append("-");
|
||||
}
|
||||
|
||||
return tag.toString();
|
||||
}
|
||||
|
||||
public String getInfoName() { return "NwayVenn"; }
|
||||
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFHeaderLineType.String, "N-way Venn split"); }
|
||||
}
|
||||
|
|
@ -1,66 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.concordance;
|
||||
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Split up two call sets into their various "Venn diagram" sets
|
||||
*/
|
||||
public class SimpleVenn implements ConcordanceType {
|
||||
|
||||
private String sample1, sample2;
|
||||
|
||||
public SimpleVenn() {}
|
||||
|
||||
public void initialize(Map<String, String> args, Set<String> samples) {
|
||||
if ( samples.size() != 2 )
|
||||
throw new StingException("SimpleVenn concordance test cannot handle anything other than 2 VCF records");
|
||||
|
||||
Iterator<String> iter = samples.iterator();
|
||||
sample1 = iter.next();
|
||||
sample2 = iter.next();
|
||||
}
|
||||
|
||||
public String computeConcordance(Map<String, VCFGenotypeRecord> samplesToRecords, ReferenceContext ref ) {
|
||||
|
||||
VCFGenotypeRecord call1 = samplesToRecords.get(sample1);
|
||||
if ( call1 != null && call1.isNoCall() )
|
||||
call1 = null;
|
||||
VCFGenotypeRecord call2 = samplesToRecords.get(sample2);
|
||||
if ( call2 != null && call2.isNoCall() )
|
||||
call2 = null;
|
||||
|
||||
if ( call1 == null && call2 == null )
|
||||
return null;
|
||||
|
||||
// set 1 only
|
||||
if ( call2 == null )
|
||||
return sample1 + "_only";
|
||||
|
||||
// set 2 only
|
||||
else if ( call1 == null )
|
||||
return sample2 + "_only";
|
||||
|
||||
// at this point we know that neither is null, so now we need to test for alternate allele concordance
|
||||
VCFRecord callV1 = call1.getRecord();
|
||||
VCFRecord callV2 = call2.getRecord();
|
||||
|
||||
// we can't really deal with multi-allelic variants
|
||||
if ( callV1.isBiallelic() && callV2.isBiallelic() ) {
|
||||
// intersection (concordant)
|
||||
if ( callV1.getAlternativeBaseForSNP() == callV2.getAlternativeBaseForSNP() )
|
||||
return "concordant";
|
||||
// intersection (discordant)
|
||||
else
|
||||
return "discordant";
|
||||
}
|
||||
|
||||
return "concordant";
|
||||
}
|
||||
|
||||
public String getInfoName() { return "Venn"; }
|
||||
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFHeaderLineType.String, "2-way Venn split"); }
|
||||
}
|
||||
|
|
@ -10,43 +10,11 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
|||
return "-T CallsetConcordance -R " + oneKGLocation + "reference/human_b36_both.fasta -L 1:1-8000 -CO %s";
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleVenn() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF," + validationDataLocation + "NA12878.example1.vcf -B set2,VCF," + validationDataLocation + "NA12878.example2.vcf -CT SimpleVenn", 1,
|
||||
Arrays.asList("a1970effe9c51923d52af9034e778de4"));
|
||||
executeTest("testSimpleVenn", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
//@Test
|
||||
public void testSNPConcordance() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF," + validationDataLocation + "NA12878.example1.vcf -B set2,VCF," + validationDataLocation + "NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
|
||||
Arrays.asList("e7a0d52c266ba3c76283111674c7168f"));
|
||||
executeTest("testSNPConcordance", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNWayVenn() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF," + validationDataLocation + "NA12878.example1.vcf -B set2,VCF," + validationDataLocation + "NA12878.example2.vcf -B set3,VCF," + validationDataLocation + "CEU.sample.vcf -CT NWayVenn", 1,
|
||||
Arrays.asList("e65fc811137fca7d6c32125240c7468f"));
|
||||
executeTest("testNWayVenn", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMulti() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF," + validationDataLocation + "NA12878.example1.vcf -B set2,VCF," + validationDataLocation + "NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5", 1,
|
||||
Arrays.asList("ddc2507590e28743e9cb4b132cb066e7"));
|
||||
executeTest("testMulti", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testComplex() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF," + validationDataLocation + "complexExample.vcf -B set2,VCF," + validationDataLocation + "complexExample.vcf -CT NWayVenn", 1,
|
||||
Arrays.asList("250df7bde7a8cf9c7ee7c5704183ea88"));
|
||||
executeTest("testComplex", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue