Fix for GSA-589: SelectVariants with -number gives biased results. The implementation was not good and it's not worth keeping this busted code around given that we have a working implementation of a fractional random sampling already in place, so I removed it.

This commit is contained in:
Eric Banks 2012-10-06 22:39:49 -04:00
parent e8a6460a33
commit bfc551f612
2 changed files with 5 additions and 71 deletions

View File

@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*; import java.util.*;
/** /**
@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
/**
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
*/
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
protected int numRandom = 0;
/** /**
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
*/ */
@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
/* Private class used to store the intermediate variants in the integer random selection process */
private static class RandomVariantStructure {
private VariantContext vc;
RandomVariantStructure(VariantContext vcP) {
vc = vcP;
}
public void set (VariantContext vcP) {
vc = vcP;
}
}
public enum NumberAlleleRestriction { public enum NumberAlleleRestriction {
ALL, ALL,
BIALLELIC, BIALLELIC,
@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
/* variables used by the SELECT RANDOM modules */ /* variables used by the SELECT RANDOM modules */
private boolean SELECT_RANDOM_NUMBER = false;
private boolean SELECT_RANDOM_FRACTION = false; private boolean SELECT_RANDOM_FRACTION = false;
private int variantNumber = 0;
private int nVariantsAdded = 0;
private int positionToAdd = 0;
private RandomVariantStructure [] variantArray;
//Random number generator for the genotypes to remove //Random number generator for the genotypes to remove
private Random randomGenotypes = new Random(); private Random randomGenotypes = new Random();
@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true); mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
} }
SELECT_RANDOM_NUMBER = numRandom > 0;
if (SELECT_RANDOM_NUMBER) {
logger.info("Selecting " + numRandom + " variants at random from the variant track");
variantArray = new RandomVariantStructure[numRandom];
}
SELECT_RANDOM_FRACTION = fractionRandom > 0; SELECT_RANDOM_FRACTION = fractionRandom > 0;
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
break; break;
} }
} }
if ( !failedJexlMatch ) { if ( !failedJexlMatch &&
if (SELECT_RANDOM_NUMBER) { !justRead &&
randomlyAddVariant(++variantNumber, sub); ( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) {
} vcfWriter.add(sub);
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
if ( ! justRead )
vcfWriter.add(sub);
}
} }
} }
} }
@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
public void onTraversalDone(Integer result) { public void onTraversalDone(Integer result) {
logger.info(result + " records processed."); logger.info(result + " records processed.");
if (SELECT_RANDOM_NUMBER) {
int positionToPrint = positionToAdd;
for (int i=0; i<numRandom; i++) {
vcfWriter.add(variantArray[positionToPrint].vc);
positionToPrint = nextCircularPosition(positionToPrint);
}
}
} }
@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( sawDP ) if ( sawDP )
builder.attribute("DP", depth); builder.attribute("DP", depth);
} }
private void randomlyAddVariant(int rank, VariantContext vc) {
if (nVariantsAdded < numRandom)
variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
else {
double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
double t = (1.0/(rank-numRandom+1));
if ( v < t) {
variantArray[positionToAdd].set(vc);
nVariantsAdded++;
positionToAdd = nextCircularPosition(positionToAdd);
}
}
}
private int nextCircularPosition(int cur) {
if ((cur + 1) == variantArray.length)
return 0;
return cur + 1;
}
} }

View File

@ -573,7 +573,7 @@ public class VariantContextUtils {
} }
// if we have more alternate alleles in the merged VC than in one or more of the // if we have more alternate alleles in the merged VC than in one or more of the
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
for ( final VariantContext vc : VCs ) { for ( final VariantContext vc : VCs ) {
if (vc.alleles.size() == 1) if (vc.alleles.size() == 1)
continue; continue;