diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/LocalAssemblyEngine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/LocalAssemblyEngine.java new file mode 100755 index 000000000..0640d5600 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/LocalAssemblyEngine.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.playground.gatk.walkers.assembly; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMRecord; + +import java.io.PrintStream; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 14, 2011 + */ +public abstract class LocalAssemblyEngine { + + public enum ASSEMBLER { + SIMPLE_DE_BRUIJN + } + + private PrintStream out; + private IndexedFastaSequenceFile referenceReader; + + protected LocalAssemblyEngine(PrintStream out, IndexedFastaSequenceFile referenceReader) { + this.out = out; + this.referenceReader = referenceReader; + } + + protected PrintStream getOutputStream() { return out; } + + protected IndexedFastaSequenceFile getReferenceReader() { return referenceReader; } + + public abstract void runLocalAssembly(List reads); + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java new file mode 100755 index 000000000..e40be059e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.playground.gatk.walkers.assembly; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMRecord; + +import java.io.PrintStream; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 14, 2011 + */ +public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { + + // minimum base quality required in a contiguous stretch of a given read to be used in the assembly + private static final int MIN_BASE_QUAL_TO_USE = 20; + + // reference base padding size + private static final int REFERENCE_PADDING = 30; + + public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader) { + super(out, referenceReader); + } + + public void runLocalAssembly(List reads) { + createDeBruijnGraph(reads); + + assignReadsToGraph(reads); + } + + private void createDeBruijnGraph(List reads) { + + // TODO -- implement me + + } + + private void assignReadsToGraph(List reads) { + + // TODO -- implement me + + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java new file mode 100755 index 000000000..29aaa4e83 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.assembly; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; + +import java.io.*; +import java.util.*; + +/** + * Performs local assembly. Not to be used yet. + */ +public class WindowedAssemblyWalker extends ReadWalker { + + protected static final int MIN_MAPPING_QUALITY = 20; + + + @Output(doc="Base-space graph output", required=true) + protected PrintStream graphWriter = null; + + @Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false) + protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN; + + // the assembly engine + LocalAssemblyEngine assemblyEngine = null; + + // the intervals input by the user + private Iterator intervals = null; + + // the current interval in the list + private GenomeLoc currentInterval = null; + + // the reads that fall into the current interval + private final ArrayList readsToAssemble = new ArrayList(); + + + public void initialize() { + + IndexedFastaSequenceFile referenceReader; + try { + // fasta reference reader to supplement the edges of the reference sequence + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); + } + + assemblyEngine = makeAssembler(ASSEMBLER_TO_USE, referenceReader); + + GenomeLocSortedSet intervalsToAssemble = getToolkit().getIntervals(); + if ( intervalsToAssemble == null || intervalsToAssemble.isEmpty() ) + throw new UserException.BadInput("Intervals must be provided with -L or -BTI (preferably not larger than several hundred bp)"); + + intervals = intervalsToAssemble.clone().iterator(); + currentInterval = intervals.hasNext() ? intervals.next() : null; + } + + private LocalAssemblyEngine makeAssembler(LocalAssemblyEngine.ASSEMBLER type, IndexedFastaSequenceFile referenceReader) { + switch ( type ) { + case SIMPLE_DE_BRUIJN: + return new SimpleDeBruijnAssembler(graphWriter, referenceReader); + default: + throw new UserException.BadInput("Assembler type " + type + " is not valid/supported"); + } + } + + public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + return currentInterval == null || doNotTryToClean(read) ? null : read; + } + + private boolean doNotTryToClean(SAMRecord read) { + return read.getNotPrimaryAlignmentFlag() || + read.getReadFailsVendorQualityCheckFlag() || + read.getDuplicateReadFlag() || + read.getMappingQuality() < MIN_MAPPING_QUALITY; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(SAMRecord read, Integer sum) { + if ( read == null ) + return sum; + + GenomeLoc readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(read); + // hack to get around unmapped reads having screwy locations + if ( readLoc.getStop() == 0 ) + readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); + + if ( readLoc.overlapsP(currentInterval) ) { + readsToAssemble.add(read); + } else { + assemblyEngine.runLocalAssembly(readsToAssemble); + readsToAssemble.clear(); + sum++; + + do { + currentInterval = intervals.hasNext() ? intervals.next() : null; + } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); + } + + return sum; + } + + public void onTraversalDone(Integer result) { + if ( readsToAssemble.size() > 0 ) { + assemblyEngine.runLocalAssembly(readsToAssemble); + result++; + } + logger.info("Ran local assembly on " + result + " intervals"); + } +} \ No newline at end of file