1) GATKArgumentCollection has a command to turn off randomization if setting the seed isn't enough. Right now it's only hooked into RankSumTest.
 2) RankSumTest now can be passed a boolean telling it whether to use a dithering or non-randomizing comparator. Unit tested.
 3) VariantsToBinaryPed can now output in both individual-major and SNP-major mode. Integration test.
 4) Updates to PlinkBed-handling python scripts and utilities.
 5) Tool for calculating (LD-corrected) GRMs put under version control. This is analysis for T2D, but I don't want to lose it should something happen to my computer.
This commit is contained in:
Christopher Hartl 2012-10-03 16:02:42 -04:00
parent ac87ed47bb
commit 1be8a88909
4 changed files with 107 additions and 15 deletions

View File

@ -140,6 +140,9 @@ public class GATKArgumentCollection {
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false; public boolean nonDeterministicRandomSeed = false;
@Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
public boolean disableRandomization = false;
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// Downsampling Arguments // Downsampling Arguments

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator; package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsC
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -19,10 +21,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/** /**
@ -30,6 +29,7 @@ import java.util.Map;
*/ */
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final boolean DEBUG = false; static final boolean DEBUG = false;
private boolean useDithering = true;
public Map<String, Object> annotate(final RefMetaDataTracker tracker, public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker, final AnnotatorCompatible walker,
@ -70,7 +70,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
if (refQuals.isEmpty() && altQuals.isEmpty()) if (refQuals.isEmpty() && altQuals.isEmpty())
return null; return null;
final MannWhitneyU mannWhitneyU = new MannWhitneyU(); final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
for (final Double qual : altQuals) { for (final Double qual : altQuals) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
} }
@ -131,4 +131,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
} }
/**
* Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if
* engine randomization is turned off, and if so does not dither.
* @param walker
* @param toolkit
* @param headerLines
*/
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
useDithering = ! toolkit.getArguments().disableRandomization;
}
} }

View File

@ -66,6 +66,9 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
"(in which case it will be copied to the file you provide as fam output).") "(in which case it will be copied to the file you provide as fam output).")
File metaDataFile; File metaDataFile;
@Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)")
OutputMode mode = OutputMode.INDIVIDUAL_MAJOR;
@Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
PrintStream outBed; PrintStream outBed;
@ -81,6 +84,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
boolean majorAlleleFirst = false; boolean majorAlleleFirst = false;
enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR }
private static double APPROX_CM_PER_BP = 1000000.0/750000.0; private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
private static final byte HOM_REF = 0x0; private static final byte HOM_REF = 0x0;
@ -138,14 +143,18 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
throw new UserException("No metadata provided for sample "+sample); throw new UserException("No metadata provided for sample "+sample);
} }
} }
try { if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); // only need to instantiate the files and buffers if in individual major.
printMap.put(sample,new PrintStream(temp)); // Cut down on memory.
tempFiles.put(sample,temp); try {
} catch (IOException e) { File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
throw new ReviewedStingException("Error creating temporary file",e); printMap.put(sample,new PrintStream(temp));
tempFiles.put(sample,temp);
} catch (IOException e) {
throw new ReviewedStingException("Error creating temporary file",e);
}
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
} }
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
famOrder.add(sample); famOrder.add(sample);
} }
} }
@ -195,6 +204,17 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
// write an entry into the map file // write an entry into the map file
outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(), outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
refOut,altOut); refOut,altOut);
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
writeIndividualMajor(vc,altMajor);
} else {
writeSNPMajor(vc,altMajor);
}
return 1;
}
public void writeIndividualMajor(VariantContext vc, boolean altMajor) {
// store genotypes per sample into the buffer // store genotypes per sample into the buffer
for ( Genotype g : vc.getGenotypes() ) { for ( Genotype g : vc.getGenotypes() ) {
String sample = g.getSampleName(); String sample = g.getSampleName();
@ -202,6 +222,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
byte enc = getEncoding(g,genotypeCount,altMajor); byte enc = getEncoding(g,genotypeCount,altMajor);
samBuf[byteCount] |= enc; samBuf[byteCount] |= enc;
} }
genotypeCount++; genotypeCount++;
if ( genotypeCount % 4 == 0 ) { if ( genotypeCount % 4 == 0 ) {
byteCount++; byteCount++;
@ -222,8 +243,29 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
} }
genotypeCount = 0; genotypeCount = 0;
} }
}
return 1; public void writeSNPMajor(VariantContext vc, boolean altMajor) {
// for each sample, write the genotype into the bed file, in the
// order of the fam file
genotypeCount = 0;
byteCount = 0;
byte[] bytes = new byte[(3+famOrder.size())/4]; // this exploits java integer fractions, which round down by default (1-4) -> 1, (5-8) -> 2
for ( Genotype g : vc.getGenotypesOrderedBy(famOrder) ) {
byte enc = getEncoding(g,genotypeCount,altMajor);
bytes[byteCount] |= enc;
genotypeCount++;
if ( genotypeCount % 4 == 0 ) {
byteCount++;
genotypeCount = 0;
}
}
try {
outBed.write(bytes);
} catch (IOException e) {
throw new ReviewedStingException("Error writing to output bed file",e);
}
} }
public Integer reduce(Integer m, Integer r) { public Integer reduce(Integer m, Integer r) {
@ -236,6 +278,14 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
public void onTraversalDone(Integer numSites) { public void onTraversalDone(Integer numSites) {
logger.info(String.format("%d sites processed!",numSites)); logger.info(String.format("%d sites processed!",numSites));
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
mergeGenotypeTempFiles(numSites);
}
}
private void mergeGenotypeTempFiles(int numSites) {
// push out the remaining genotypes and close stream // push out the remaining genotypes and close stream
for ( String sample : printMap.keySet() ) { for ( String sample : printMap.keySet() ) {
try { try {
@ -278,7 +328,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
throw new ReviewedStingException("Error reading form temp file for input.",e); throw new ReviewedStingException("Error reading form temp file for input.",e);
} }
} }
} }
private byte getEncoding(Genotype g, int offset, boolean altMajor) { private byte getEncoding(Genotype g, int offset, boolean altMajor) {
@ -355,7 +404,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private void writeBedHeader() { private void writeBedHeader() {
// write magic bits into the ped file // write magic bits into the ped file
try { try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, (byte) (mode == OutputMode.INDIVIDUAL_MAJOR ? 0x0 : 0x1)});
// ultimately, the bed will be in individual-major mode // ultimately, the bed will be in individual-major mode
} catch (IOException e) { } catch (IOException e) {
throw new ReviewedStingException("error writing to output file."); throw new ReviewedStingException("error writing to output file.");

View File

@ -28,6 +28,13 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
} }
public static String baseTestString(String inputVCF, String inputMetaData, int gq, String mode) {
return "-T VariantsToBinaryPed -R " + b37KGReference + " -mode "+mode +
" -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) +
" -bim %s -fam %s -bed %s";
}
@Test @Test
public void testNA12878Alone() { public void testNA12878Alone() {
String testName = "testNA12878Alone"; String testName = "testNA12878Alone";
@ -52,6 +59,18 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
executeTest(testName, spec); executeTest(testName, spec);
} }
@Test
public void testNA12878AloneSNPMajor() {
String testName = "testNA12878AloneSNPMajor";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10,"SNP_MAJOR"),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","ada1acc475d096012b921b3219c3a446")
);
executeTest(testName, spec);
}
@Test @Test
public void testNA12878HighGQ() { public void testNA12878HighGQ() {
String testName = "testNA12878HighGQ"; String testName = "testNA12878HighGQ";
@ -86,6 +105,16 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
); );
} }
@Test
public void test1000GWithIndelsSNPMajor() {
String testName = "test1000GWithIndelsSNPMajor";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0,"SNP_MAJOR"),
3,
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","4a0ba3d0594b06306aa6459e4e28ec9a","451498ceff06c1649890900fa994f1af")
);
}
@Test @Test
public void test1000G_Symbolic() { public void test1000G_Symbolic() {
String testName = "test1000G_Symbolic"; String testName = "test1000G_Symbolic";