Somewhat major update. Changes:
- ProduceBeagleInputWalker + Now takes a validation ROD and a prior to give it, will use those genotypes in place of the variant genotypes if both are present + Takes a bootstrap argument -- can use some given %age of the validation sites + Optionally takes a bootstrap output argument -- re-prints the validation VCF, filtering those sites used as part of the bootstrap -BeagleOutputToVCFWalker + Now filters sites where the genotypes have been reverted to hom ref + Now calls in to the new VCUtils to calculate AC/AN -Queue + New pipeline libraries for easy qscript creation, still a work in progress, but this is a considerable prototype + full calling pipeline v2 uses the above libraries + minor changes to some of my own scripts + no more need for contig interval lists, these will be parsed out of your normal interval list when it is provided git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4459 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e02f837659
commit
21ec44339d
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -161,6 +162,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
int numGenotypesChangedByBeagle = 0;
|
||||
Integer alleleCountH = 0, chrCountH = 0;
|
||||
Double alleleFrequencyH = 0.0;
|
||||
int beagleVarCounts = 0;
|
||||
|
||||
Map<String,Genotype> hapmapGenotypes = null;
|
||||
|
||||
|
|
@ -284,31 +286,26 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
originalAttributes.put("OG",".");
|
||||
}
|
||||
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
|
||||
|
||||
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
|
||||
beagleVarCounts++;
|
||||
}
|
||||
|
||||
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
|
||||
|
||||
}
|
||||
|
||||
VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
|
||||
|
||||
Set<Allele> altAlleles = filteredVC.getAlternateAlleles();
|
||||
StringBuffer altAlleleCountString = new StringBuffer();
|
||||
for ( Allele allele : altAlleles ) {
|
||||
if ( altAlleleCountString.length() > 0 )
|
||||
altAlleleCountString.append(",");
|
||||
altAlleleCountString.append(filteredVC.getChromosomeCount(allele));
|
||||
VariantContext filteredVC;
|
||||
if ( beagleVarCounts > 0)
|
||||
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
|
||||
else {
|
||||
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
|
||||
removedFilters.add(String.format("BGL_RM_WAS_%s/%s",vc_input.getReference().getBaseString(),vc_input.getAlternateAllele(0)));
|
||||
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet<Allele>(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes());
|
||||
}
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());
|
||||
if ( filteredVC.getChromosomeCount() > 0 ) {
|
||||
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount()));
|
||||
if ( altAlleleCountString.length() > 0 ) {
|
||||
attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString());
|
||||
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f",
|
||||
Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount())));
|
||||
}
|
||||
}
|
||||
// re-compute chromosome counts
|
||||
VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false);
|
||||
|
||||
// Get Hapmap AC and AF
|
||||
if (vc_comp != null) {
|
||||
|
|
@ -322,8 +319,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
attributes.put("R2", beagleR2Feature.getR2value().toString() );
|
||||
|
||||
|
||||
|
||||
vcfWriter.add(VariantContext.modifyAttributes(filteredVC, attributes), ref.getBase());
|
||||
vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes), ref.getBase());
|
||||
|
||||
|
||||
return 1;
|
||||
|
|
|
|||
|
|
@ -28,13 +28,14 @@ package org.broadinstitute.sting.gatk.walkers.beagle;
|
|||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broad.tribble.vcf.VCFWriter;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
|
|
@ -42,6 +43,8 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
|
@ -53,6 +56,7 @@ import java.util.*;
|
|||
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||
|
||||
public static final String ROD_NAME = "variant";
|
||||
public static final String VALIDATION_ROD_NAME = "validation";
|
||||
|
||||
@Output(doc="File to which BEAGLE input should be written",required=true)
|
||||
protected PrintStream beagleWriter = null;
|
||||
|
|
@ -61,9 +65,24 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
public PrintStream beagleGenotypesWriter = null;
|
||||
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
|
||||
public double insertedNoCallRate = 0;
|
||||
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
|
||||
public double validationPrior = -1.0;
|
||||
@Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
|
||||
public double bootstrap = 0.0;
|
||||
@Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
|
||||
VCFWriter bootstrapVCFOutput = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false)
|
||||
public double variantPrior = 0.96;
|
||||
|
||||
|
||||
private Set<String> samples = null;
|
||||
private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
|
||||
private Random generator;
|
||||
private int bootstrapCount = -1;
|
||||
private int bootstrapSetSize = 0;
|
||||
private int testSetSize = 0;
|
||||
|
||||
public void initialize() {
|
||||
generator = new Random();
|
||||
|
|
@ -77,114 +96,176 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
beagleWriter.println();
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.println("dummy header");
|
||||
|
||||
if ( bootstrapVCFOutput != null ) {
|
||||
initializeVcfWriter();
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
if( tracker != null ) {
|
||||
if( tracker != null ) {
|
||||
GenomeLoc loc = context.getLocation();
|
||||
VariantContext vc_eval;
|
||||
VariantContext variant_eval;
|
||||
VariantContext validation_eval;
|
||||
|
||||
vc_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false);
|
||||
if ( vc_eval == null || vc_eval.isFiltered() )
|
||||
variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false);
|
||||
validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc,false);
|
||||
if ( goodSite(variant_eval,validation_eval) ) {
|
||||
if ( useValidation(variant_eval,validation_eval, ref) ) {
|
||||
writeBeagleOutput(validation_eval,variant_eval,true,validationPrior);
|
||||
} else {
|
||||
if ( goodSite(variant_eval) ) {
|
||||
writeBeagleOutput(variant_eval,validation_eval,false,variantPrior);
|
||||
return 1;
|
||||
} else { // todo -- if the variant site is bad, validation is good, but not in bootstrap set -- what do?
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
|
||||
if (!vc_eval.isSNP())
|
||||
return 0;
|
||||
|
||||
if (!vc_eval.isBiallelic())
|
||||
return 0;
|
||||
|
||||
// output marker ID to Beagle input file
|
||||
beagleWriter.print(String.format("%s ", VariantContextUtils.getLocation(vc_eval).toString()));
|
||||
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.print(String.format("%s ", VariantContextUtils.getLocation(vc_eval).toString()));
|
||||
|
||||
for (Allele allele: vc_eval.getAlleles()) {
|
||||
// TODO -- check whether this is really needed by Beagle
|
||||
String bglPrintString;
|
||||
if (allele.isNoCall() || allele.isNull())
|
||||
bglPrintString = "0";
|
||||
else
|
||||
bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele
|
||||
|
||||
beagleWriter.print(String.format("%s ", bglPrintString));
|
||||
}
|
||||
|
||||
if ( !vc_eval.hasGenotypes() )
|
||||
return 0;
|
||||
|
||||
Map<String, Genotype> genotypes = vc_eval.getGenotypes();
|
||||
for ( String sample : samples ) {
|
||||
// use sample as key into genotypes structure
|
||||
Genotype genotype = genotypes.get(sample);
|
||||
if (genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
|
||||
String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(",");
|
||||
|
||||
double[] likeArray = new double[glArray.length];
|
||||
|
||||
// convert to double array so we can normalize
|
||||
int k=0;
|
||||
for (String gl : glArray) {
|
||||
likeArray[k++] = Double.valueOf(gl);
|
||||
}
|
||||
|
||||
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
|
||||
// see if we need to randomly mask out genotype in this position.
|
||||
Double d = generator.nextDouble();
|
||||
if (d > insertedNoCallRate ) {
|
||||
// System.out.format("%5.4f ", d);
|
||||
for (Double likeVal: normalizedLikelihoods)
|
||||
beagleWriter.print(String.format("%5.4f ",likeVal));
|
||||
}
|
||||
else {
|
||||
// we are masking out this genotype
|
||||
beagleWriter.print("0.33 0.33 0.33 ");
|
||||
}
|
||||
|
||||
if (beagleGenotypesWriter != null) {
|
||||
char a = genotype.getAllele(0).toString().charAt(0);
|
||||
char b = genotype.getAllele(0).toString().charAt(0);
|
||||
|
||||
beagleGenotypesWriter.format("%c %c ", a, b);
|
||||
}
|
||||
}
|
||||
else if (genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
|
||||
// hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype
|
||||
// is confident. This is useful for Hapmap and 1KG release VCFs.
|
||||
double AA = 0.02;
|
||||
double AB = 0.02;
|
||||
double BB = 0.02;
|
||||
|
||||
if (genotype.isHomRef()) { AA = 0.96; }
|
||||
else if (genotype.isHet()) { AB = 0.96; }
|
||||
else if (genotype.isHomVar()) { BB = 0.96; }
|
||||
|
||||
beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB);
|
||||
|
||||
if (beagleGenotypesWriter != null) {
|
||||
char a = genotype.getAllele(0).toString().charAt(0);
|
||||
char b = genotype.getAllele(0).toString().charAt(0);
|
||||
|
||||
beagleGenotypesWriter.format("%c %c ", a, b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.print(". . ");
|
||||
}
|
||||
}
|
||||
|
||||
beagleWriter.println();
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.println();
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
public boolean goodSite(VariantContext a, VariantContext b) {
|
||||
return goodSite(a) || goodSite(b);
|
||||
}
|
||||
|
||||
public boolean goodSite(VariantContext v) {
|
||||
return v != null && ! v.isFiltered() && v.isSNP() && v.isBiallelic() && v.hasGenotypes();
|
||||
}
|
||||
|
||||
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
|
||||
if( goodSite(validation) ) {
|
||||
// if using record keeps us below expected proportion, use it
|
||||
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
|
||||
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
|
||||
if ( bootstrapVCFOutput != null ) {
|
||||
bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() );
|
||||
}
|
||||
bootstrapSetSize ++;
|
||||
return true;
|
||||
} else {
|
||||
if ( bootstrapVCFOutput != null ) {
|
||||
bootstrapVCFOutput.add(validation,ref.getBase());
|
||||
}
|
||||
testSetSize++;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if ( validation != null && bootstrapVCFOutput != null ) {
|
||||
bootstrapVCFOutput.add(validation,ref.getBase());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
|
||||
beagleWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString()));
|
||||
if ( beagleGenotypesWriter != null ) {
|
||||
beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString()));
|
||||
}
|
||||
|
||||
for ( Allele allele : preferredVC.getAlleles() ) {
|
||||
String bglPrintString;
|
||||
if (allele.isNoCall() || allele.isNull())
|
||||
bglPrintString = "0";
|
||||
else
|
||||
bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele
|
||||
|
||||
beagleWriter.print(String.format("%s ", bglPrintString));
|
||||
}
|
||||
|
||||
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
|
||||
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
||||
boolean isValidation;
|
||||
for ( String sample : samples ) {
|
||||
Genotype genotype;
|
||||
// use sample as key into genotypes structure
|
||||
if ( preferredGenotypes.keySet().contains(sample) ) {
|
||||
genotype = preferredGenotypes.get(sample);
|
||||
isValidation = isValidationSite;
|
||||
} else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) {
|
||||
genotype = otherGenotypes.get(sample);
|
||||
isValidation = ! isValidationSite;
|
||||
} else {
|
||||
// there is magically no genotype for this sample.
|
||||
throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen.");
|
||||
}
|
||||
/**
|
||||
* Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
|
||||
*/
|
||||
if ( (isValidation && prior < 0.0) || genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
|
||||
String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(",");
|
||||
|
||||
double[] likeArray = new double[glArray.length];
|
||||
|
||||
// convert to double array so we can normalize
|
||||
int k=0;
|
||||
for (String gl : glArray) {
|
||||
likeArray[k++] = Double.valueOf(gl);
|
||||
}
|
||||
|
||||
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
|
||||
// see if we need to randomly mask out genotype in this position.
|
||||
Double d = generator.nextDouble();
|
||||
if (d > insertedNoCallRate ) {
|
||||
// System.out.format("%5.4f ", d);
|
||||
for (Double likeVal: normalizedLikelihoods)
|
||||
beagleWriter.print(String.format("%5.4f ",likeVal));
|
||||
}
|
||||
else {
|
||||
// we are masking out this genotype
|
||||
beagleWriter.print("0.33 0.33 0.33 ");
|
||||
}
|
||||
|
||||
if (beagleGenotypesWriter != null) {
|
||||
char a = genotype.getAllele(0).toString().charAt(0);
|
||||
char b = genotype.getAllele(0).toString().charAt(0);
|
||||
|
||||
beagleGenotypesWriter.format("%c %c ", a, b);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* otherwise, use the prior uniformly
|
||||
*/
|
||||
else if (! isValidation && genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
|
||||
// hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype
|
||||
// is confident. This is useful for Hapmap and 1KG release VCFs.
|
||||
double AA = (1.0-prior)/2.0;
|
||||
double AB = (1.0-prior)/2.0;
|
||||
double BB = (1.0-prior)/2.0;
|
||||
|
||||
if (genotype.isHomRef()) { AA = prior; }
|
||||
else if (genotype.isHet()) { AB = prior; }
|
||||
else if (genotype.isHomVar()) { BB = prior; }
|
||||
|
||||
beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB);
|
||||
|
||||
if (beagleGenotypesWriter != null) {
|
||||
char a = genotype.getAllele(0).toString().charAt(0);
|
||||
char b = genotype.getAllele(0).toString().charAt(0);
|
||||
|
||||
beagleGenotypesWriter.format("%c %c ", a, b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.print(". . ");
|
||||
}
|
||||
}
|
||||
|
||||
beagleWriter.println();
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.println();
|
||||
}
|
||||
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0; // Nothing to do here
|
||||
}
|
||||
|
|
@ -195,6 +276,19 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
|
||||
public void onTraversalDone( Integer sum ) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void initializeVcfWriter() {
|
||||
|
||||
final ArrayList<String> inputNames = new ArrayList<String>();
|
||||
inputNames.add( VALIDATION_ROD_NAME );
|
||||
|
||||
// setup the header fields
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
hInfo.add(new VCFFilterHeaderLine("bootstrap","This site used for genotype bootstrapping with ProduceBeagleInputWalker"));
|
||||
|
||||
bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -54,4 +54,27 @@ public class BeagleIntegrationTest extends WalkerTest {
|
|||
executeTest("test BeagleInput", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBeagleInput2() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T ProduceBeagleInput -B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+
|
||||
"-B:validation,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+
|
||||
"-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta -NO_HEADER ",2,
|
||||
Arrays.asList("44d28b6b092d5f4c0ae59af442612ea3","481c58f8309916184a33ab1835e5cc48"));
|
||||
executeTest("test BeagleInputWithBootstrap",spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBeagleOutput2() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T BeagleOutputToVCF -R "+hg19Reference+" "+
|
||||
"-B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.vcf "+
|
||||
"-B:beagleR2,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
|
||||
"-B:beagleProbs,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
|
||||
"-B:beaglePhased,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
|
||||
"-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("fd104b7da9e6c99b662c6ad746fd53f3"));
|
||||
|
||||
executeTest("testBeagleChangesSitesToRef",spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
import com.sun.tools.doclets.internal.toolkit.util.DocFinder.Input
|
||||
import org.broadinstitute.sting.gatk.{CommandLineGATK, DownsampleType}
|
||||
import java.io.File
|
||||
import net.sf.picard.reference.FastaSequenceFile
|
||||
import org.broadinstitute.sting.commandline.Argument
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
||||
import org.broadinstitute.sting.queue.extensions.samtools._
|
||||
import org.broadinstitute.sting.queue.pipeline.{BamProcessing,VariantCalling}
|
||||
import org.broadinstitute.sting.queue.{QException, QScript}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
|
||||
class fullCallingPipelineV2 extends QScript {
|
||||
qscript =>
|
||||
|
||||
@Argument(doc="Number of cleaning jobs", shortName="cleaningJobs")
|
||||
var cleaningJobs: Int = _
|
||||
|
||||
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
|
||||
var yamlFile: File = _
|
||||
|
||||
@Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
|
||||
var trigger: File = _
|
||||
|
||||
@Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable")
|
||||
var refseqTable: File = _
|
||||
|
||||
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)
|
||||
var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
|
||||
|
||||
@Input(doc="path to GATK jar")
|
||||
var gatkJar: File = _
|
||||
|
||||
@Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
|
||||
var target_titv: Float = _
|
||||
|
||||
@Input(doc="per-sample downsampling level",shortName="dcov",required=false)
|
||||
var downsampling_coverage = 300
|
||||
|
||||
@Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
|
||||
var num_snp_scatter_jobs = 20
|
||||
|
||||
@Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
|
||||
var num_indel_scatter_jobs = 5
|
||||
|
||||
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
|
||||
var skip_cleaning = false
|
||||
|
||||
//@Input(doc="ADPR script")
|
||||
//var adprScript: File = _
|
||||
|
||||
//@Input(doc="Sequencing maching name (for use by adpr)")
|
||||
//var machine: String = _
|
||||
|
||||
//@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
|
||||
//var protocol: String = _
|
||||
|
||||
private var pipeline: Pipeline = _
|
||||
|
||||
trait CommandLineGATKArgs extends CommandLineGATK {
|
||||
this.intervals = qscript.pipeline.getProject.getIntervalList
|
||||
this.jarFile = qscript.gatkJar
|
||||
this.reference_sequence = qscript.pipeline.getProject.getReferenceFile
|
||||
this.memoryLimit = Some(4)
|
||||
}
|
||||
|
||||
|
||||
// ------------ SETUP THE PIPELINE ----------- //
|
||||
|
||||
|
||||
def script = {
|
||||
pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
|
||||
var callingLib: VariantCalling = new VariantCalling(qscript.yamlFile,qscript.gatkJar)
|
||||
var cleaningLib: BamProcessing = new BamProcessing(qscript.yamlFile,qscript.gatkJar,qscript.picardFixMatesJar)
|
||||
|
||||
val projectBase: String = qscript.pipeline.getProject.getName
|
||||
val cleanedBase: String = projectBase + ".cleaned"
|
||||
val uncleanedBase: String = projectBase + ".uncleaned"
|
||||
|
||||
// there are commands that use all the bam files
|
||||
val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
|
||||
|
||||
var bamsToClean: List[(File,File)] = Nil
|
||||
var recalBams: List[File] = Nil
|
||||
var cleanedBams: List[File] = Nil
|
||||
|
||||
for ( sample <- recalibratedSamples ) {
|
||||
val bam = sample.getBamFiles.get("recalibrated")
|
||||
recalBams :+= bam
|
||||
if (!sample.getBamFiles.contains("cleaned")) {
|
||||
sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam"))
|
||||
bamsToClean :+= (bam,sample.getBamFiles.get("cleaned"))
|
||||
}
|
||||
|
||||
cleanedBams :+= sample.getBamFiles.get("cleaned")
|
||||
}
|
||||
|
||||
if ( !qscript.skip_cleaning ) {
|
||||
cleaningLib.StandardIndelRealign(bamsToClean,qscript.cleaningJobs)
|
||||
}
|
||||
|
||||
if (!qscript.skip_cleaning) {
|
||||
endToEnd(cleanedBase, cleanedBams, lib)
|
||||
} else {
|
||||
endToEnd(uncleanedBase, recalBams, lib)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def endToEnd(base: String, bamFiles: List[File], lib: VariantCalling) = {
|
||||
var recal_vcf = base+"_snps.recal.annotated.tranched.vcf"
|
||||
var handfilt_vcf = base+"_snps.handfiltered.annotated.vcf"
|
||||
var indel_vcf = base+"_indel_calls.vcf"
|
||||
|
||||
for ( c <- lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,qscript.target_titv,qscript.refseqTable) ) {
|
||||
add(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
import java.io.{FileReader, File, BufferedReader}
|
||||
import net.sf.picard.reference.FastaSequenceFile
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.gatk.DownsampleType
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
||||
import org.broadinstitute.sting.queue.extensions.samtools._
|
||||
import org.broadinstitute.sting.queue.{QException, QScript}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
|
||||
|
||||
class omni_bootstrap_refine extends QScript {
|
||||
=> script
|
||||
|
||||
val ref = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
|
||||
val gatkJar = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
|
||||
|
||||
trait FooTrait extends CommandLineGATK {
|
||||
this.reference_sequence = script.ref
|
||||
this.jarFile = script.gatkJar
|
||||
this.intervalsString :+= "20"
|
||||
}
|
||||
|
||||
class SampleOverlap extends CommandLineFunction {
|
||||
@Input(doc="omni vcf") var omni: File = _
|
||||
@Input(doc="other vcf") var other: File = _
|
||||
@Output(doc="output sn file") var out : File = _
|
||||
|
||||
def commandLine = "%s ; %s ; %s ; %s".format(this.cmd1, this.cmd2, this.cmd3, this.cmd4)
|
||||
|
||||
def cmd1 : String = {
|
||||
return "head -n 500 %s | grep #CHR | cut -f10- | tr '\t' '\n' | sort > temp1".format(omni.getAbsolutePath)
|
||||
}
|
||||
|
||||
def cmd2 : String = {
|
||||
return "head -n 500 %s | grep #CHR | cut -f10- | tr '\t' '\n'| sort > temp2".format(other.getAbsolutePath)
|
||||
}
|
||||
|
||||
def cmd3 : String = {
|
||||
return "cat temp1 temp2 | sort | uniq -c | awk '{if($1 == 2) print $2}' > %s".format(out.getAbsolutePath)
|
||||
}
|
||||
|
||||
def cmd4 : String = {
|
||||
return "rm temp1 temp2"
|
||||
}
|
||||
}
|
||||
|
||||
class BeagleRefinement extends CommandLineFunction {
|
||||
@Input(doc="The beagle input file") var beagleInput: File = _
|
||||
var beagleOutputBase: String = _
|
||||
var beagleMemoryGigs: Int = 4
|
||||
|
||||
/**
|
||||
* Note: These get set
|
||||
*/
|
||||
@Output(doc="The beagle phased file") var beaglePhasedFile: File = _
|
||||
@Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
|
||||
@Output(doc="The beagle r2 file") var beagleRSquared: File = _
|
||||
var beagleOutputDir: String = _
|
||||
|
||||
def freezeOutputs = {
|
||||
if ( beagleInput.getParent == null ) {
|
||||
beagleOutputDir = ""
|
||||
} else {
|
||||
beagleOutputDir = beagleInput.getParent
|
||||
}
|
||||
beaglePhasedFile = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".phased.gz")
|
||||
beagleLikelihoods = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
|
||||
beagleRSquared = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".r2")
|
||||
}
|
||||
|
||||
def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,beagleOutputBase)
|
||||
}
|
||||
|
||||
def runme(pop: File, omni: File, base: String) : List[CommandLineFunction] = {
|
||||
var clf : List[CommandLineFunction] = Nil
|
||||
val s_overlap = new File(base+"_sample_overlap.txt")
|
||||
var calcOverlap = new SampleOverlap
|
||||
calcOverlap.omni = omni
|
||||
calcOverlap.other = pop
|
||||
calcOverlap.out = s_overlap
|
||||
|
||||
clf += calcOverlap
|
||||
|
||||
val subset_omni = swapExt(omni,".vcf","_%s_subset.vcf".format(base))
|
||||
val subset_pop = swapExt(pop,".vcf","_%s_subset.vcf".format(base))
|
||||
|
||||
var omSubset = new SelectVariants with FooTrait
|
||||
omSubset.variantVCF = omni
|
||||
omSubset.sample = s_overlap.getAbsolutePath
|
||||
omSubset.out = subset_omni
|
||||
|
||||
clf += omSubset
|
||||
|
||||
var popSubset = new SelectVariants with FooTrait
|
||||
popSubset.variantVCF = pop
|
||||
popSubset.sample = s_overlap.getAbsolutePath
|
||||
popSubset.out = subset_pop
|
||||
|
||||
clf += popSubset
|
||||
|
||||
var bootRefine = new ProduceBeagleInput with FooTrait
|
||||
bootRefine.variantVCF = popSubset
|
||||
bootRefine.rodBind :+= new RodBind("validation","VCF",subset_omni)
|
||||
bootRefine.bootstrap_vcf = swapExt(subset_omni,".vcf",".boot.vcf")
|
||||
bootRefine.bootstrap = Some(2)
|
||||
bootRefine.validation_genotype_ptrue = Some(0.98);
|
||||
bootRefine.out = new File(base+".beagle_input.beagle")
|
||||
|
||||
clf += bootRefine
|
||||
|
||||
var runBeagle = new BeagleRefinement
|
||||
runBeagle.beagleInput = bootRefine.out
|
||||
runBeagle.beagleOutputBase = base+"_beagle"
|
||||
runBeagle.freezeOutputs()
|
||||
|
||||
clf += runBeagle
|
||||
|
||||
var putBack = new BeagleOutputToVCF with FooTrait
|
||||
putBack.
|
||||
}
|
||||
}
|
||||
|
|
@ -14,22 +14,28 @@ import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
|
|||
class omni_qc extends QScript {
|
||||
qscript =>
|
||||
|
||||
var pilot3_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/pilot3_overlap.txt")
|
||||
var pilot1_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/pilot1_overlap.txt")
|
||||
var production_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/production_overlap.txt")
|
||||
var hiseq_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878_name.txt")
|
||||
// NON-OMNI VCF FILES
|
||||
var pilot3_release_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/pilot3/merge_release/ALL.exon.2010_03.genotypes.vcf","vcf")
|
||||
var pilot1_ceu_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf")
|
||||
var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf")
|
||||
var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf")
|
||||
var august_calls_vcf = null
|
||||
var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
|
||||
var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf")
|
||||
|
||||
// OMNI VCF FILES
|
||||
var OMNI_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b36.vcf","vcf")
|
||||
var OMNI_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b37.deduped.vcf.gz","vcf")
|
||||
var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot_b36.vcf.gz","vcf")
|
||||
|
||||
// INTERVALS
|
||||
var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list"
|
||||
var pilot1_interval_list: String = _
|
||||
var hiseq_interval_list: String = _
|
||||
var production_interval_list: String = "20"
|
||||
|
||||
val Omni_chip_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/Omni_2_5_603samples.vcf.gz","vcf")
|
||||
val Omni_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/Omni_2_5_603samples.b37.vcf.gz","vcf")
|
||||
val pilot3_vcf_broad = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/ALL.exon.2010_05.broad.genotypes.vcf.gz","vcf")
|
||||
val pilot1_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf")
|
||||
val production_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/ALL.production.2010_08.chr20.lowpass.genotypes.vcf","vcf")
|
||||
val hiSeq_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
|
||||
// OTHER
|
||||
var failed_qc_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/failed_qc_samples.txt")
|
||||
|
||||
trait OmniArgs extends CommandLineGATK {
|
||||
this.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta")
|
||||
|
|
@ -80,6 +86,10 @@ class omni_qc extends QScript {
|
|||
eval.reportLocation = new TaggedFile(outDir+base+".subset.omni.eval","eval")
|
||||
eval.reportType = Some(VE2TemplateType.R)
|
||||
eval.analysisName = base+"_Eval"
|
||||
eval.select ++= List("AC==1","AC==2","AC==3","AC==4","AC==5","AC>5")
|
||||
eval.selectName ++= List("AC1","AC2","AC3","AC4","AC5","AC6+")
|
||||
eval.disI = true
|
||||
eval.outputVCF = outDir+base+".eval.interesting.sites.vcf"
|
||||
|
||||
if ( base.equalsIgnoreCase("production") ) {
|
||||
subset.variantVCF = Omni_b37_vcf
|
||||
|
|
@ -88,6 +98,8 @@ class omni_qc extends QScript {
|
|||
eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
|
||||
}
|
||||
|
||||
|
||||
|
||||
add(subset,makeOmniSiteList,eval)
|
||||
}
|
||||
|
||||
|
|
@ -96,5 +108,6 @@ class omni_qc extends QScript {
|
|||
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf_broad,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3")
|
||||
runme(hiseq_overlap_samples,hiseq_interval_list,hiSeq_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/hiSeq/","NA12878_HiSeq")
|
||||
runme(production_overlap_samples,production_interval_list,production_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/production/","production")
|
||||
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3_release")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,4 +38,5 @@ trait QScript extends Logging {
|
|||
* Adds one or more command line functions to be run.
|
||||
*/
|
||||
def add(functions: CommandLineFunction*) = this.functions ++= List(functions:_*)
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,6 +12,9 @@ import org.broadinstitute.sting.queue.function.scattergather.{SimpleTextGatherFu
|
|||
trait CommandLineFunction extends QFunction with Logging {
|
||||
def commandLine: String
|
||||
|
||||
/* Is it a gather function? */
|
||||
var isGather: Boolean = true
|
||||
|
||||
/** Upper memory limit */
|
||||
var memoryLimit: Option[Int] = None
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,138 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory
|
||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||
import collection.mutable.{ListBuffer, HashMap}
|
||||
import collection.JavaConversions
|
||||
import java.util.Arrays
|
||||
import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
|
||||
import org.broadinstitute.sting.commandline.{Output, Input}
|
||||
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
|
||||
|
||||
class BamProcessing(yaml: File, gatkJar: File, fixMatesJar: File) {
|
||||
library =>
|
||||
|
||||
var attributes: Pipeline = YamlUtils.load(classOf[Pipeline],yaml)
|
||||
|
||||
trait StandardCommandLineGATK extends CommandLineGATK {
|
||||
this.reference_sequence = library.attributes.getProject.getReferenceFile
|
||||
this.intervals = library.attributes.getProject.getIntervalList
|
||||
this.DBSNP = library.attributes.getProject.getDbsnpFile
|
||||
this.memoryLimit = Some(2)
|
||||
this.jarFile = library.gatkJar
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a standard realigner target creator CLF given a bam, an output file, and the contigs over which to run
|
||||
* @Returns: A CLF for the realigner target creator job
|
||||
*/
|
||||
protected def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = {
|
||||
var rtc = new RealignerTargetCreator with StandardCommandLineGATK
|
||||
rtc.intervals = null
|
||||
rtc.intervalsString = contigs
|
||||
rtc.input_file :+= bam
|
||||
rtc.out = output
|
||||
rtc.analysisName = "RealignerTargetCreator"
|
||||
|
||||
return rtc
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a standard indel cleaner CLF given a bam, the results of the target creator, and an output .bam file
|
||||
* @Returns: A CLF for the indel cleaning job
|
||||
*/
|
||||
protected def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = {
|
||||
var realigner = new IndelRealigner with StandardCommandLineGATK
|
||||
realigner.intervalsString = contigs
|
||||
realigner.intervals = null
|
||||
realigner.input_file :+= bam
|
||||
realigner.out = outBam
|
||||
realigner.targetIntervals = targets
|
||||
realigner.analysisName = "IndelClean"
|
||||
realigner.bam_compression = Some(0)
|
||||
|
||||
return realigner
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and bam to merge everything to
|
||||
* @Returns: A list of CLFs (todo -- wrapped in a Pipeline)
|
||||
*/
|
||||
protected def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = {
|
||||
var cmds : List[CommandLineFunction] = Nil
|
||||
var jobSpecs : List[(File,File,List[String])] = jobContigs.map[(File,File,List[String]),List[(File,File,List[String])]](
|
||||
ctigs => { (bam, swapExt(bam,".bam",".%s.bam".format(ctigs.mkString("_"))), ctigs) }
|
||||
)
|
||||
var bamsToMerge : List[File] = Nil
|
||||
for ( spec <- jobSpecs ) {
|
||||
cmds :+= StandardIndelCleaner(spec._1,spec._3,targets,spec._2)
|
||||
bamsToMerge :+= spec._2
|
||||
}
|
||||
|
||||
cmds :+= StandardPicardFixMates(bamsToMerge,cleanedBam,library.fixMatesJar)
|
||||
|
||||
return cmds
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates a set of
|
||||
* command line functions to do the target-creating, splitting, cleaning, and merging, returning that list
|
||||
* of command line functions
|
||||
* @Returns: A list of command line functions for the full indel realignment pipeline from the collection
|
||||
* of uncleaned bams to the collection of cleaned bams
|
||||
*/
|
||||
protected def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = {
|
||||
val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs)
|
||||
var commands : List[CommandLineFunction] = Nil
|
||||
for ( bamPair <- bamsUncleanCleanPairs ) {
|
||||
val rtc : RealignerTargetCreator = StandardRealignerTargetCreator(bamPair._1,contigsForJobs.foldLeft[List[String]](Nil)( (a,b) => a ::: b), swapExt(bamPair._1,".bam",".targets") )
|
||||
val icbs : List[CommandLineFunction] = StandardIndelCleanBam(bamPair._1,contigsForJobs,rtc.out,bamPair._2)
|
||||
val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction
|
||||
sam.bamFile = bamPair._2
|
||||
sam.analysisName = "SamtoolsIndex"
|
||||
commands :+= rtc
|
||||
commands ++= icbs
|
||||
commands :+= sam
|
||||
}
|
||||
|
||||
return commands
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Merges N bam files into one bam file, fixing mate pairs in the process; does not assume they are sorted
|
||||
* @Returns: Command line function for the merge, fix-mate, and sort operation
|
||||
*/
|
||||
protected def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = {
|
||||
var pfm : PicardFixMates = new PicardFixMates
|
||||
pfm.bams = inBams
|
||||
pfm.outBam = outBam
|
||||
pfm.jarFile = picardJar
|
||||
pfm.assumeSorted = Some(false)
|
||||
pfm.memoryLimit = Some(4)
|
||||
pfm.analysisName = "FixMates"
|
||||
|
||||
return pfm
|
||||
}
|
||||
|
||||
class PicardFixMates extends PicardBamJarFunction {
|
||||
@Input(doc="input bam files") var bams: List[File] = Nil
|
||||
@Output(doc="output bam file") var outBam: File = null
|
||||
|
||||
def inputBams: List[File] = bams
|
||||
def outputBam: File = outBam
|
||||
|
||||
}
|
||||
|
||||
|
||||
def swapExt(file: File, oldExtension: String, newExtension: String) =
|
||||
new File(file.getName.stripSuffix(oldExtension) + newExtension)
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
|
||||
trait QPipeline {
|
||||
var commands: List[CommandLineFunction] = Nil
|
||||
|
||||
def generateCommands
|
||||
|
||||
def track // todo -- implement me
|
||||
|
||||
def removeIntermediate // todo -- implement me
|
||||
}
|
||||
|
|
@ -0,0 +1,340 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
|
||||
import java.io.File
|
||||
import net.sf.picard.reference.FastaSequenceFile
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.gatk.DownsampleType
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
||||
import org.broadinstitute.sting.queue.extensions.samtools._
|
||||
import org.broadinstitute.sting.queue.{QException, QScript}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
|
||||
class VariantCalling(yaml: File,gatkJar: File) {
|
||||
vc =>
|
||||
|
||||
// load attributes
|
||||
var attributes: Pipeline = YamlUtils.load(classOf[Pipeline], yaml)
|
||||
|
||||
/**
|
||||
* Trait to propagate basic attributes throughout the library
|
||||
*/
|
||||
trait StandardCommandLineGATK extends CommandLineGATK {
|
||||
this.reference_sequence = vc.attributes.getProject.getReferenceFile
|
||||
this.intervals = vc.attributes.getProject.getIntervalList
|
||||
this.DBSNP = vc.attributes.getProject.getDbsnpFile
|
||||
// set global memory limit on the low side. Additional input bams will affect it.
|
||||
this.memoryLimit = Some(2)
|
||||
this.jarFile = vc.gatkJar
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a standard UnifiedGenotyper CLF from input bams and an output file
|
||||
* @Return: UnifiedGenotyper with the standard GSA arguments
|
||||
* @TODO: Add a formula: f(#bams)=memory; allow yaml to specify triggers and perhaps other information
|
||||
*/
|
||||
protected def StandardUnifiedGenotyper(bams : List[File], output : File) : UnifiedGenotyper = {
|
||||
var ug = new UnifiedGenotyper with StandardCommandLineGATK
|
||||
ug.analysisName = "UnifiedGenotyper"
|
||||
ug.input_file = bams
|
||||
ug.group :+= "Standard"
|
||||
ug.out = output
|
||||
ug.min_base_quality_score = Some(10)
|
||||
ug.min_mapping_quality_score = Some(10)
|
||||
ug.cap_base_quality_by_mapping_quality = true
|
||||
ug.standard_min_confidence_threshold_for_emitting = Some(10)
|
||||
ug.standard_min_confidence_threshold_for_calling = Some(30)
|
||||
ug.trigger_min_confidence_threshold_for_calling = Some(0)
|
||||
ug.downsample_to_coverage = Some(300)
|
||||
ug.dt = Some(DownsampleType.BY_SAMPLE)
|
||||
ug.scatterCount = 50
|
||||
|
||||
if ( bams.size > 125 ) {
|
||||
ug.memoryLimit = Some(4)
|
||||
}
|
||||
|
||||
return ug
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Creates a CLF to call indels on a specific .bam file, outputting to a given output file
|
||||
* @Returns: An IndelGenotyperV2 CLF with standard GSA arguments
|
||||
*/
|
||||
protected def StandardIndelGenotyper(bam : File, output: File) : IndelGenotyperV2 = {
|
||||
var ig = new IndelGenotyperV2 with StandardCommandLineGATK
|
||||
ig.analysisName = "IndelGenotyper"
|
||||
ig.input_file :+= bam
|
||||
ig.out = output
|
||||
ig.downsample_to_coverage = Some(300)
|
||||
return ig
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Accessor method to StandardIndelGenotyper that allows it to be marked as a scatter job in a pipeline
|
||||
* @Returns: An IndelGenotyperV2 CLF with standard GSA arguments, marked as a scatter job
|
||||
*/
|
||||
private def StandardIndelGenotyper(bam : File, output: File, gather: Boolean) : IndelGenotyperV2 = {
|
||||
var ig = StandardIndelGenotyper(bam,output)
|
||||
ig.isGather = gather
|
||||
return ig
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Combines a list of indel VCFs to a single output file
|
||||
* @Returns: A CombineVariants CLF with standard GSA arguments
|
||||
*/
|
||||
protected def StandardIndelCombine( igList : List[IndelGenotyperV2], output : File ) : CombineVariants = {
|
||||
var cv = new CombineVariants with StandardCommandLineGATK
|
||||
cv.out = output
|
||||
cv.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY)
|
||||
cv.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
|
||||
cv.analysisName = "IndelGenotyper"
|
||||
cv.isGather = true
|
||||
cv.priority = (igList.map[String,List[String]](ig => swapExt(ig.out,".vcf","").getAbsolutePath)).mkString(",")
|
||||
//cv.priority = (igList.foldLeft[List[String]](Nil)( (prLs, ig) => prLs ::: List(swapExt(ig.out,".vcf","").getAbsolutePath))).mkString(",")
|
||||
cv.rodBind = igList.map[RodBind,List[RodBind]](ig => new RodBind(swapExt(ig.out,".vcf","").getName,"VCF",ig.out))
|
||||
|
||||
return cv
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @Doc: Generates indel calls on a list of .bam files, and merges those calls into an output file. This is a small pipeline.
|
||||
* @Returns: A list of CLFs that run indel calls and indel merging. User has zero control over individual indel VCF names.
|
||||
*/
|
||||
protected def StandardIndelCalls ( bams : List[File], output : File ) : List[CommandLineGATK] = {
|
||||
var genotypers = bams.foldLeft[List[IndelGenotyperV2]](Nil)( (igs,bam) => igs ::: List(this.StandardIndelGenotyper(bam, swapExt(bam,".bam",".indels.vcf"), false)))
|
||||
var combine = this.StandardIndelCombine( genotypers, output )
|
||||
var callFunctions: List[CommandLineGATK] = genotypers
|
||||
callFunctions = combine :: callFunctions
|
||||
|
||||
return callFunctions
|
||||
}
|
||||
|
||||
protected def StandardFilterAtIndels ( snps: File, indels: File, output : File ) : VariantFiltration = {
|
||||
var iFil = new VariantFiltration with StandardCommandLineGATK
|
||||
iFil.analysisName = "FilterAtIndels"
|
||||
iFil.out = output
|
||||
iFil.mask = indels.getAbsolutePath
|
||||
iFil.maskName = "Indel_Mask"
|
||||
iFil.variantVCF = snps
|
||||
// todo -- cluster size varies with # bams
|
||||
iFil.clusterSize = Some(5)
|
||||
iFil.clusterWindowSize = Some(8)
|
||||
|
||||
return iFil
|
||||
}
|
||||
|
||||
protected def StandardHandfilter( snps: File, output: File ) : VariantFiltration = {
|
||||
var hFil = new VariantFiltration with StandardCommandLineGATK
|
||||
hFil.analysisName = "HandFilter"
|
||||
hFil.out = output
|
||||
hFil.variantVCF = snps
|
||||
hFil.filterExpression :+= "QD<5"
|
||||
hFil.filterName :+= "LowQualByDepth"
|
||||
hFil.filterExpression :+= "AB>0.75"
|
||||
hFil.filterName :+= "HighAlleleBalance"
|
||||
hFil.filterExpression :+= "SB>-0.10"
|
||||
hFil.filterName :+= "HighStrandBias"
|
||||
|
||||
return hFil
|
||||
}
|
||||
|
||||
protected def StandardVariantCluster( snps: File, output: File ) : GenerateVariantClusters = {
|
||||
var genC = new GenerateVariantClusters with StandardCommandLineGATK
|
||||
genC.analysisName = "VariantQualityRecalibration"
|
||||
genC.rodBind :+= new RodBind("input","VCF",snps)
|
||||
genC.cluster_file = output
|
||||
genC.use_annotation :+= "QD"
|
||||
genC.use_annotation :+= "SB"
|
||||
genC.use_annotation :+= "HaplotypeScore"
|
||||
genC.use_annotation :+= "AB"
|
||||
genC.use_annotation :+= "HRun"
|
||||
|
||||
return genC
|
||||
}
|
||||
|
||||
protected def StandardVariantRecalibrator ( raw_vcf: File, cluster: File, target_titv: scala.Double, out_vcf: File,
|
||||
out_tranches: File, out_dat: File) : VariantRecalibrator = {
|
||||
var vr = new VariantRecalibrator with StandardCommandLineGATK
|
||||
vr.analysisName = "VariantQualityRecalibration"
|
||||
vr.rodBind :+= new RodBind("input","VCF",raw_vcf)
|
||||
vr.cluster_file = cluster
|
||||
vr.target_titv = target_titv
|
||||
vr.out = out_vcf
|
||||
vr.tranches_file = out_tranches
|
||||
vr.report_dat_file = out_dat
|
||||
vr.tranche :+= "0.1"
|
||||
vr.tranche :+= "1"
|
||||
vr.tranche :+= "5"
|
||||
vr.tranche :+= "10"
|
||||
|
||||
return vr
|
||||
|
||||
}
|
||||
|
||||
protected def StandardApplyVariantCuts( snpRecal: File, tranches: File, output: File) : ApplyVariantCuts = {
|
||||
var avc = new ApplyVariantCuts with StandardCommandLineGATK
|
||||
avc.analysisName = "VariantQualityRecalibration"
|
||||
avc.rodBind :+= new RodBind("input","VCF",snpRecal)
|
||||
avc.out = output
|
||||
avc.tranches_file = tranches
|
||||
avc.fdr_filter_level = Some(5)
|
||||
|
||||
return avc
|
||||
}
|
||||
|
||||
protected def StandardRecalibrateVariants( snps: File, targetTiTv: scala.Double, recalVcf: File) : List[CommandLineGATK] = {
|
||||
var clust = StandardVariantCluster(snps, swapExt(snps,".vcf",".cluster"))
|
||||
var recal = StandardVariantRecalibrator(snps,clust.clusterFile,targetTiTv,swapExt(snps,".vcf",".recal.vcf"),
|
||||
swapExt(snps,".vcf",".recal.tranch"),swapExt(snps,".vcf",".recal.dat"))
|
||||
var cut = StandardApplyVariantCuts(recal.out,recal.tranches_file,recal.out)
|
||||
|
||||
var cmds: List[CommandLineGATK] = Nil
|
||||
cmds :+= clust
|
||||
cmds :+= recal
|
||||
cmds :+= cut
|
||||
|
||||
return cmds
|
||||
}
|
||||
|
||||
protected def StandardGenomicAnnotation ( snps: File, refseqFile: File, outputVCF: File) : GenomicAnnotator = {
|
||||
var ga = new GenomicAnnotator with StandardCommandLineGATK
|
||||
ga.analysisName = "GenomicAnnotator"
|
||||
ga.variantVCF = snps
|
||||
ga.rodBind :+= new RodBind("refseq","AnnotatorInputTable",refseqFile)
|
||||
ga.rodToIntervalTrackName = "variant"
|
||||
ga.out = outputVCF
|
||||
|
||||
return ga
|
||||
}
|
||||
|
||||
protected def StandardSNPCalls( bams: List[File], output: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
|
||||
var commands : List[CommandLineGATK] = Nil
|
||||
var dir = ""
|
||||
|
||||
if ( output.getParent != null ) {
|
||||
dir = output.getParent+"/"
|
||||
}
|
||||
|
||||
var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
|
||||
var ug = StandardUnifiedGenotyper(bams, raw_snp)
|
||||
|
||||
commands :+= ug
|
||||
|
||||
var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
|
||||
var ig = StandardIndelCalls(bams,raw_indel)
|
||||
|
||||
commands ++= ig
|
||||
|
||||
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
|
||||
var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
|
||||
|
||||
commands :+= iFilt
|
||||
|
||||
var annoSNP = prefilt_snp
|
||||
if ( refGene != null ) {
|
||||
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
|
||||
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
|
||||
|
||||
commands :+= annotate
|
||||
}
|
||||
|
||||
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, output)
|
||||
|
||||
commands ++= recal
|
||||
|
||||
return commands
|
||||
}
|
||||
|
||||
protected def StandardSNPCallsBothFilterTypes(bams: List[File], recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
|
||||
var commands : List[CommandLineGATK] = Nil
|
||||
var dir = ""
|
||||
|
||||
if ( recalOut.getParent != null ) {
|
||||
dir = recalOut.getParent+"/"
|
||||
}
|
||||
|
||||
var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
|
||||
var ug = StandardUnifiedGenotyper(bams, raw_snp)
|
||||
|
||||
commands :+= ug
|
||||
|
||||
var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
|
||||
var ig = StandardIndelCalls(bams,raw_indel)
|
||||
|
||||
commands ++= ig
|
||||
|
||||
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
|
||||
var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
|
||||
|
||||
commands :+= iFilt
|
||||
|
||||
var annoSNP = prefilt_snp
|
||||
if ( refGene != null ) {
|
||||
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
|
||||
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
|
||||
|
||||
commands :+= annotate
|
||||
}
|
||||
|
||||
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
|
||||
|
||||
commands ++= recal
|
||||
|
||||
var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
|
||||
|
||||
commands :+= handFilt
|
||||
|
||||
return commands
|
||||
}
|
||||
|
||||
protected def StandardCallingPipeline(bams: List[File], indelOut: File, recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
|
||||
var commands : List[CommandLineGATK] = Nil
|
||||
var dir = ""
|
||||
|
||||
if ( recalOut.getParent != null ) {
|
||||
dir = recalOut.getParent+"/"
|
||||
}
|
||||
|
||||
var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
|
||||
var ug = StandardUnifiedGenotyper(bams, raw_snp)
|
||||
|
||||
commands :+= ug
|
||||
|
||||
var raw_indel = indelOut
|
||||
var ig = StandardIndelCalls(bams,raw_indel)
|
||||
|
||||
commands ++= ig
|
||||
|
||||
var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
|
||||
var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
|
||||
|
||||
commands :+= iFilt
|
||||
|
||||
var annoSNP = prefilt_snp
|
||||
if ( refGene != null ) {
|
||||
annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
|
||||
var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
|
||||
|
||||
commands :+= annotate
|
||||
}
|
||||
|
||||
var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
|
||||
|
||||
commands ++= recal
|
||||
|
||||
var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
|
||||
|
||||
commands :+= handFilt
|
||||
|
||||
return commands
|
||||
}
|
||||
|
||||
def swapExt(file: File, oldExtension: String, newExtension: String) =
|
||||
new File(file.getName.stripSuffix(oldExtension) + newExtension)
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
package org.broadinstitute.sting.queue.util
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||
|
||||
class PipelineUtils {
|
||||
|
||||
}
|
||||
|
||||
object PipelineUtils{
|
||||
|
||||
def smartSplitContigs(reference: File, intervals: File, sets: Int) : List[List[String]] = {
|
||||
GenomeLocParser.setupRefContigOrdering(ReferenceSequenceFileFactory.getReferenceSequenceFile(reference))
|
||||
val targets = IntervalUtils.parseIntervalArguments(List(intervals.getAbsolutePath), false)
|
||||
|
||||
// Build up a map of contigs with sizes.
|
||||
var contigSizes = Map.empty[String, Long]
|
||||
// todo -- make this look like functional code for Q's sake
|
||||
//targets.foreach( loc => { contigSizes += loc -> { contigSizes.get(loc.getContig) match { case Some(size) => size + loc.size case None => loc.size } } })
|
||||
|
||||
for (loc <- targets) {
|
||||
val contig = loc.getContig
|
||||
val contigSize = loc.size
|
||||
contigSizes += contig -> {
|
||||
contigSizes.get(contig) match {
|
||||
case Some(size) => size + contigSize
|
||||
case None => contigSize
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Keep a list of pairs of sizes with lists of contigs.
|
||||
var splitContigs = List.empty[(Long, List[String])]
|
||||
for ((contigName, contigSize) <- contigSizes) {
|
||||
if (splitContigs.size < sets) {
|
||||
// If there are fewer than the requested number of sets, just add this contig.
|
||||
splitContigs :+= contigSize -> List(contigName)
|
||||
} else {
|
||||
// If there is already a number of sets
|
||||
// sort the contigs to get the smallest one first.
|
||||
splitContigs = splitContigs.sortBy{case (size, contigs) => size}
|
||||
// Update the pair with the new contig size and name.
|
||||
var smallContigs = splitContigs.head
|
||||
smallContigs = (smallContigs._1 + contigSize) -> (smallContigs._2 :+ contigName)
|
||||
// Re add the pair to the list.
|
||||
splitContigs = smallContigs :: splitContigs.tail
|
||||
}
|
||||
}
|
||||
|
||||
splitContigs.map{case (size, contigs) => contigs}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue