From 5974675b43fc10e3d2490907ba5110d22a7bb1e4 Mon Sep 17 00:00:00 2001 From: carneiro Date: Fri, 27 May 2011 22:03:08 +0000 Subject: [PATCH] Two intermediate commits, to work over the weekend. ReplicationValidationWalker: Just the skeleton of what will be the implementation of the replication/validation model. dataProcessingV2: Committing an UNTESTED implementation of BWA alignment. I am running tests on it over the weekend. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5900 348d0f76-0448-11de-a6fe-93d51630548a --- .../ReplicationValidationWalker.java | 57 ++++++++++++ .../oneoffs/carneiro/dataProcessingV2.scala | 92 ++++++++++++++----- 2 files changed, 125 insertions(+), 24 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java new file mode 100755 index 000000000..55fc362f2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/replication_validation/ReplicationValidationWalker.java @@ -0,0 +1,57 @@ +package org.broadinstitute.sting.playground.gatk.walkers.replication_validation; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; + +import java.io.PrintStream; + +/** + * Implementation of the replication and validation framework with reference based error model + * for pooled sequencing. + * + * The input should be a BAM file with pooled sequencing data where each pool is represented by + * samples with the same barcode. + * + * A reference sample name must be provided and it must be barcoded uniquely. + */ +public class ReplicationValidationWalker extends LocusWalker implements TreeReducible { + @Argument(shortName="refSample", fullName="reference_sample_name", doc="reference sample name", required=true) + String referenceSampleName = "NA12878"; + + @Output(doc="Write output to this file instead of STDOUT") + PrintStream out; + + public void initialize() { + return; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + // todo -- Locate the reference samples for each lane + // todo -- Locate each pool and assign a reference sample to it + return 1; + } + + public Long reduceInit() { return 0l; } + + public Long reduce(Integer value, Long sum) { + return value + sum; + } + + /** + * Reduces two subtrees together. In this case, the implementation of the tree reduce + * is exactly the same as the implementation of the single reduce. + */ + public Long treeReduce(Long lhs, Long rhs) { + return lhs + rhs; + } + + public void onTraversalDone( Long c ) { + out.println(c); + } +} diff --git a/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala b/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala index 6462df8c4..b89a02dd7 100755 --- a/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala +++ b/scala/qscript/oneoffs/carneiro/dataProcessingV2.scala @@ -10,6 +10,7 @@ import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord} import scala.io.Source._ import collection.JavaConversions._ import org.broadinstitute.sting.queue.function.scattergather.ScatterFunction +import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter class dataProcessingV2 extends QScript { @@ -60,6 +61,9 @@ class dataProcessingV2 extends QScript { @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ + @Input(doc="The path to the binary of samtools (required if using BWA)", fullName="path_to_sam", shortName="sam", required=false) + var samPath: File = _ + @Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=false) var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") @@ -111,8 +115,8 @@ class dataProcessingV2 extends QScript { var sample: String = "" for (r <- readGroups) { if (sample.isEmpty) - sample = r.getSample() - else if (sample != r.getSample()) + sample = r.getSample + else if (sample != r.getSample) return true; } return false @@ -126,8 +130,8 @@ class dataProcessingV2 extends QScript { val sampleTable = scala.collection.mutable.Map.empty[String, List[File]] for (bam <- bamFiles) { val samReader = new SAMFileReader(bam) - val header = samReader.getFileHeader() - val readGroups = header.getReadGroups() + val header = samReader.getFileHeader + val readGroups = header.getReadGroups // only allow one sample per file. Bam files with multiple samples would require pre-processing of the file // with PrintReads to separate the samples. Tell user to do it himself! @@ -135,7 +139,7 @@ class dataProcessingV2 extends QScript { // Fill out the sample table with the readgroups in this file for (rg <- readGroups) { - val sample = rg.getSample() + val sample = rg.getSample if (!sampleTable.contains(sample)) sampleTable(sample) = List(bam) else if ( !sampleTable(sample).contains(bam)) @@ -160,10 +164,46 @@ class dataProcessingV2 extends QScript { } - // Takes a list of processed BAM files, revert them to unprocessed and realigns each lane, producing a list of - // per-lane aligned bam files, ready to be processed. - def performAlignment(bamList: File): List[File] = { - return List() + def buildReadGroupString(samReader: SAMFileReader): String = { + val readGroups = samReader.getFileHeader.getReadGroups + var s = "" + for (rg <- readGroups) { + if (s != "") { + s = s + "\n" + } + s = s + "@RG\t" + + SAMReadGroupRecord.READ_GROUP_ID_TAG + ":" + rg.getReadGroupId + "\t" + + SAMReadGroupRecord.PLATFORM_TAG + ":" + rg.getPlatformUnit + "\t" + + SAMReadGroupRecord.LIBRARY_TAG + ":" + rg.getLibrary + "\t" + + SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG + ":" + rg.getSample + "\t" + + SAMReadGroupRecord.SEQUENCING_CENTER_TAG + ":" + rg.getSequencingCenter + "\t" + } + return s +} + + // Takes a list of processed BAM files and realign them using the BWA option requested (bwase or bwape). + // Returns a list of realigned BAM files. + def performAlignment(bamMap: Map[String, File]) { + for (key <- bamMap.keysIterator) { + val bam: File = bamMap(key) + val saiFile1 = swapExt(bam, ".bam", "1.sai") + val saiFile2 = swapExt(bam, ".bam", "2.sai") + val realignedSamFile = swapExt(bam, ".bam", ".realigned.sam") + val realignedBamFile = swapExt(bam, ".bam", ".realigned.bam") + val readGroupString: String = buildReadGroupString(new SAMFileReader()) + if (useBWAse) { + add(bwa_aln_se(bam, saiFile1), + bwa_sam_se(bam, saiFile1, realignedSamFile, readGroupString), + to_bam(realignedSamFile, realignedBamFile)) + } + else if (useBWApe) { + add(bwa_aln_pe(bam, saiFile1), + bwa_aln_pe(bam, saiFile2), + bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile, readGroupString), + to_bam(realignedSamFile, realignedBamFile)) + } + bamMap(key) = realignedBamFile + } } def createListFromFile(in: File):List[File] = { @@ -184,14 +224,18 @@ class dataProcessingV2 extends QScript { def script = { - //todo -- (option - BWA) run BWA on each bam file (per lane bam file) before performing per sample processing - val perLaneAlignedBamFiles: List[File] = if (useBWApe || useBWAse) { performAlignment(input) } else { createListFromFile(input) } + // keep a record of the number of contigs in the first bam file in the list + val bams = createListFromFile(input) + nContigs = getNumberOfContigs(bams(0)) // Generate a BAM file per sample joining all per lane files if necessary - val sampleBamFiles = createSampleFiles(perLaneAlignedBamFiles) + val sampleBamFiles: Map[String, File] = createSampleFiles(bams) + + //todo -- (option - BWA) run BWA on each bam file (per lane bam file) before performing per sample processing + if (useBWApe || useBWAse) { + performAlignment(sampleBamFiles) + } - // keep a record of the number of contigs in the first bam file in the list - nContigs = getNumberOfContigs(perLaneAlignedBamFiles(0)) println("nContigs: " + nContigs) @@ -408,7 +452,7 @@ class dataProcessingV2 extends QScript { this.jobName = queueLogDir + outSai + ".bwa_aln_se" } - case class bwa_aln_pe1 (inBam: File, outSai1: File) extends CommandLineFunction { + case class bwa_aln_pe (inBam: File, outSai1: File) extends CommandLineFunction { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file for 1st mating pair") var sai = outSai1 def commandLine = bwaPath + " aln -q 5 " + reference + " -b1 " + bam + " > " + sai @@ -417,15 +461,6 @@ class dataProcessingV2 extends QScript { this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1" } - case class bwa_aln_pe2 (inBam: File, outSai2: File) extends CommandLineFunction { - @Input(doc="bam file to be aligned") var bam = inBam - @Output(doc="output sai file for 2nd mating pair") var sai = outSai2 - def commandLine = bwaPath + " aln -q 5 " + reference + " -b2 " + bam + " > " + sai - this.isIntermediate = true - this.analysisName = queueLogDir + outSai2 + ".bwa_aln_pe2" - this.jobName = queueLogDir + outSai2 + ".bwa_aln_pe2" - } - case class bwa_sam_se (inBam: File, inSai: File, outBam: File, readGroup: String) extends CommandLineFunction { @Input(doc="bam file to be aligned") var bam = inBam @Input(doc="bwa alignment index file") var sai = inSai @@ -447,6 +482,15 @@ class dataProcessingV2 extends QScript { this.jobName = queueLogDir + outBam + ".bwa_sam_pe" } + case class to_bam (inSam: File, outBam: File) extends CommandLineFunction { + @Input(doc="sam file to become bam") var sam = inSam + @Output(doc="bam file") var bam = outBam + def commandLine = samPath + " view -b " + sam + " > " + bam + this.isIntermediate = true + this.analysisName = queueLogDir + outBam + ".to_bam" + this.jobName = queueLogDir + outBam + ".to_bam" + } + case class writeList(inBams: List[File], outBamList: File) extends ListWriterFunction { this.inputFiles = inBams this.listFile = outBamList