Absolutely nothing to get excited about. This is just the skeleton for the local assembler. It doesn't do anything at all now except for collect reads over each -L interval and pass them to an assembly engine (which isn't implemented yet). The interface for the AssemblyEngine will change later, but for now this one is the most conducive to debugging.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5441 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2011-03-14 20:31:54 +00:00
parent 6d09cdd4bc
commit 239dae0985
3 changed files with 222 additions and 0 deletions

View File

@ -0,0 +1,34 @@
package org.broadinstitute.sting.playground.gatk.walkers.assembly;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMRecord;
import java.io.PrintStream;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 14, 2011
*/
public abstract class LocalAssemblyEngine {
public enum ASSEMBLER {
SIMPLE_DE_BRUIJN
}
private PrintStream out;
private IndexedFastaSequenceFile referenceReader;
protected LocalAssemblyEngine(PrintStream out, IndexedFastaSequenceFile referenceReader) {
this.out = out;
this.referenceReader = referenceReader;
}
protected PrintStream getOutputStream() { return out; }
protected IndexedFastaSequenceFile getReferenceReader() { return referenceReader; }
public abstract void runLocalAssembly(List<SAMRecord> reads);
}

View File

@ -0,0 +1,43 @@
package org.broadinstitute.sting.playground.gatk.walkers.assembly;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMRecord;
import java.io.PrintStream;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 14, 2011
*/
public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// minimum base quality required in a contiguous stretch of a given read to be used in the assembly
private static final int MIN_BASE_QUAL_TO_USE = 20;
// reference base padding size
private static final int REFERENCE_PADDING = 30;
public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader) {
super(out, referenceReader);
}
public void runLocalAssembly(List<SAMRecord> reads) {
createDeBruijnGraph(reads);
assignReadsToGraph(reads);
}
private void createDeBruijnGraph(List<SAMRecord> reads) {
// TODO -- implement me
}
private void assignReadsToGraph(List<SAMRecord> reads) {
// TODO -- implement me
}
}

View File

@ -0,0 +1,145 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.assembly;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.*;
import java.util.*;
/**
* Performs local assembly. Not to be used yet.
*/
public class WindowedAssemblyWalker extends ReadWalker<SAMRecord, Integer> {
protected static final int MIN_MAPPING_QUALITY = 20;
@Output(doc="Base-space graph output", required=true)
protected PrintStream graphWriter = null;
@Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false)
protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN;
// the assembly engine
LocalAssemblyEngine assemblyEngine = null;
// the intervals input by the user
private Iterator<GenomeLoc> intervals = null;
// the current interval in the list
private GenomeLoc currentInterval = null;
// the reads that fall into the current interval
private final ArrayList<SAMRecord> readsToAssemble = new ArrayList<SAMRecord>();
public void initialize() {
IndexedFastaSequenceFile referenceReader;
try {
// fasta reference reader to supplement the edges of the reference sequence
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
}
catch(FileNotFoundException ex) {
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex);
}
assemblyEngine = makeAssembler(ASSEMBLER_TO_USE, referenceReader);
GenomeLocSortedSet intervalsToAssemble = getToolkit().getIntervals();
if ( intervalsToAssemble == null || intervalsToAssemble.isEmpty() )
throw new UserException.BadInput("Intervals must be provided with -L or -BTI (preferably not larger than several hundred bp)");
intervals = intervalsToAssemble.clone().iterator();
currentInterval = intervals.hasNext() ? intervals.next() : null;
}
private LocalAssemblyEngine makeAssembler(LocalAssemblyEngine.ASSEMBLER type, IndexedFastaSequenceFile referenceReader) {
switch ( type ) {
case SIMPLE_DE_BRUIJN:
return new SimpleDeBruijnAssembler(graphWriter, referenceReader);
default:
throw new UserException.BadInput("Assembler type " + type + " is not valid/supported");
}
}
public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
return currentInterval == null || doNotTryToClean(read) ? null : read;
}
private boolean doNotTryToClean(SAMRecord read) {
return read.getNotPrimaryAlignmentFlag() ||
read.getReadFailsVendorQualityCheckFlag() ||
read.getDuplicateReadFlag() ||
read.getMappingQuality() < MIN_MAPPING_QUALITY;
}
public Integer reduceInit() {
return 0;
}
public Integer reduce(SAMRecord read, Integer sum) {
if ( read == null )
return sum;
GenomeLoc readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(read);
// hack to get around unmapped reads having screwy locations
if ( readLoc.getStop() == 0 )
readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart());
if ( readLoc.overlapsP(currentInterval) ) {
readsToAssemble.add(read);
} else {
assemblyEngine.runLocalAssembly(readsToAssemble);
readsToAssemble.clear();
sum++;
do {
currentInterval = intervals.hasNext() ? intervals.next() : null;
} while ( currentInterval != null && currentInterval.isBefore(readLoc) );
}
return sum;
}
public void onTraversalDone(Integer result) {
if ( readsToAssemble.size() > 0 ) {
assemblyEngine.runLocalAssembly(readsToAssemble);
result++;
}
logger.info("Ran local assembly on " + result + " intervals");
}
}