Fix for GSA-589: SelectVariants with -number gives biased results. The implementation was not good and it's not worth keeping this busted code around given that we have a working implementation of a fractional random sampling already in place, so I removed it.

This commit is contained in:
Eric Banks 2012-10-06 22:39:49 -04:00
parent e8a6460a33
commit bfc551f612
2 changed files with 5 additions and 71 deletions

View File

@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
/**
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
*/
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
protected int numRandom = 0;
/**
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
*/
@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
/* Private class used to store the intermediate variants in the integer random selection process */
private static class RandomVariantStructure {
private VariantContext vc;
RandomVariantStructure(VariantContext vcP) {
vc = vcP;
}
public void set (VariantContext vcP) {
vc = vcP;
}
}
public enum NumberAlleleRestriction {
ALL,
BIALLELIC,
@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
/* variables used by the SELECT RANDOM modules */
private boolean SELECT_RANDOM_NUMBER = false;
private boolean SELECT_RANDOM_FRACTION = false;
private int variantNumber = 0;
private int nVariantsAdded = 0;
private int positionToAdd = 0;
private RandomVariantStructure [] variantArray;
//Random number generator for the genotypes to remove
private Random randomGenotypes = new Random();
@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
}
SELECT_RANDOM_NUMBER = numRandom > 0;
if (SELECT_RANDOM_NUMBER) {
logger.info("Selecting " + numRandom + " variants at random from the variant track");
variantArray = new RandomVariantStructure[numRandom];
}
SELECT_RANDOM_FRACTION = fractionRandom > 0;
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
break;
}
}
if ( !failedJexlMatch ) {
if (SELECT_RANDOM_NUMBER) {
randomlyAddVariant(++variantNumber, sub);
}
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
if ( ! justRead )
vcfWriter.add(sub);
}
if ( !failedJexlMatch &&
!justRead &&
( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) {
vcfWriter.add(sub);
}
}
}
@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
public void onTraversalDone(Integer result) {
logger.info(result + " records processed.");
if (SELECT_RANDOM_NUMBER) {
int positionToPrint = positionToAdd;
for (int i=0; i<numRandom; i++) {
vcfWriter.add(variantArray[positionToPrint].vc);
positionToPrint = nextCircularPosition(positionToPrint);
}
}
}
@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( sawDP )
builder.attribute("DP", depth);
}
private void randomlyAddVariant(int rank, VariantContext vc) {
if (nVariantsAdded < numRandom)
variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
else {
double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
double t = (1.0/(rank-numRandom+1));
if ( v < t) {
variantArray[positionToAdd].set(vc);
nVariantsAdded++;
positionToAdd = nextCircularPosition(positionToAdd);
}
}
}
private int nextCircularPosition(int cur) {
if ((cur + 1) == variantArray.length)
return 0;
return cur + 1;
}
}

View File

@ -573,7 +573,7 @@ public class VariantContextUtils {
}
// if we have more alternate alleles in the merged VC than in one or more of the
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
for ( final VariantContext vc : VCs ) {
if (vc.alleles.size() == 1)
continue;