Test version of the consensus-compressing strategy. It is not usable yet and is currently being rewritten.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5605 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
80d547ae71
commit
866f4fd569
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: April 7, 2011
|
||||
*/
|
||||
public class ReduceReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
@Output
|
||||
protected StingSAMFileWriter out;
|
||||
|
||||
@Argument(fullName = "SNPContextSize", shortName = "SCS", doc = "", required = true)
|
||||
protected int SNPContextSize;
|
||||
|
||||
@Argument(fullName = "IndelContextSize", shortName = "ICS", doc = "", required = true)
|
||||
protected int IndelContextSize;
|
||||
|
||||
protected ReducingSAMFileWriter reducingOut;
|
||||
protected int totalReads = 0;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
reducingOut = new ReducingSAMFileWriter(out, SNPContextSize, IndelContextSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) {
|
||||
for ( GATKFeature feature : metaDataTracker.getAllCoveringRods() ) {
|
||||
if ( feature.getUnderlyingObject() instanceof VariantContext ) {
|
||||
VariantContext vc = (VariantContext)feature.getUnderlyingObject();
|
||||
reducingOut.addVariant(vc);
|
||||
}
|
||||
}
|
||||
|
||||
totalReads++;
|
||||
return read; // all the work is done in the reduce step for this walker
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* reduceInit is called once before any calls to the map function. We use it here to setup the output
|
||||
* bam file, if it was specified on the command line
|
||||
* @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
|
||||
*/
|
||||
@Override
|
||||
public SAMFileWriter reduceInit() {
|
||||
return reducingOut;
|
||||
}
|
||||
|
||||
/**
|
||||
* given a read and a output location, reduce by emitting the read
|
||||
* @param read the read itself
|
||||
* @param output the output source
|
||||
* @return the SAMFileWriter, so that the next reduce can emit to the same source
|
||||
*/
|
||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
||||
output.addAlignment(read);
|
||||
return output;
|
||||
}
|
||||
|
||||
public void onTraversalDone( SAMFileWriter reduceResult ) {
|
||||
logger.info("Compressed reads: " + reducingOut.getNCompressedReads());
|
||||
logger.info("Total reads : " + totalReads);
|
||||
// todo -- fixme
|
||||
//reducingOut.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,287 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import net.sf.picard.sam.SamPairUtil;
|
||||
import net.sf.samtools.*;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
//import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @author depristo
|
||||
* @version 0.1
|
||||
*/
|
||||
public class ReducingSAMFileWriter implements SAMFileWriter {
|
||||
protected static final Logger logger = Logger.getLogger(ReducingSAMFileWriter.class);
|
||||
private static final boolean DEBUG = false;
|
||||
private static final boolean INVERT = false;
|
||||
private static final boolean PRINT_CONSENSUS_READS = false;
|
||||
|
||||
/** The place where we ultimately write out our records */
|
||||
final SAMFileWriter finalDestination;
|
||||
Queue<SAMRecord> waitingReads = new LinkedList<SAMRecord>();
|
||||
LinkedList<SAMRecord> consensusReads = new LinkedList<SAMRecord>();
|
||||
final int IndelContextSize;
|
||||
final int SNPContextSize;
|
||||
Set<VariantContext> VCs = new HashSet<VariantContext>();
|
||||
|
||||
public long getNCompressedReads() {
|
||||
return nCompressedReads;
|
||||
}
|
||||
|
||||
long nCompressedReads = 0;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param header
|
||||
* @param outputFile
|
||||
* @param compressionLevel
|
||||
*/
|
||||
public ReducingSAMFileWriter(final SAMFileHeader header,
|
||||
final File outputFile,
|
||||
final int compressionLevel,
|
||||
final int SNPContextSize,
|
||||
final int IndelContextSize) {
|
||||
this(new SAMFileWriterFactory().makeBAMWriter(header, true, outputFile, compressionLevel),
|
||||
SNPContextSize,
|
||||
IndelContextSize);
|
||||
}
|
||||
|
||||
public ReducingSAMFileWriter(final SAMFileWriter finalDestination,
|
||||
final int SNPContextSize,
|
||||
final int IndelContextSize) {
|
||||
this.finalDestination = finalDestination;
|
||||
this.IndelContextSize = IndelContextSize;
|
||||
this.SNPContextSize = SNPContextSize;
|
||||
}
|
||||
|
||||
public int getIndelContextSize() {
|
||||
return IndelContextSize;
|
||||
}
|
||||
|
||||
public int getSNPContextSize() {
|
||||
return SNPContextSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the header to use when creating the new SAM file.
|
||||
* @return header to use when creating the new SAM file.
|
||||
*/
|
||||
public SAMFileHeader getFileHeader() {
|
||||
return finalDestination.getFileHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* @{inheritDoc}
|
||||
*/
|
||||
public void addAlignment( SAMRecord newRead ) {
|
||||
if ( DEBUG ) logger.info("New read pos " + newRead.getAlignmentStart() + " OP = " + newRead.getAttribute("OP"));
|
||||
|
||||
// if the new read is on a different contig, then we need to flush the queue and clear the map
|
||||
if ( waitingReads.size() > 0 && waitingReads.peek().getReferenceIndex() != newRead.getReferenceIndex()) {
|
||||
if ( DEBUG ) logger.warn("Flushing queue on move to new contig: " + newRead.getReferenceName());
|
||||
|
||||
// TODO -- fixme
|
||||
while ( ! waitingReads.isEmpty() ) {
|
||||
// emit to disk
|
||||
finalDestination.addAlignment(waitingReads.remove());
|
||||
}
|
||||
}
|
||||
|
||||
waitingReads.add(newRead);
|
||||
emitReadsIfPossible(newRead.getAlignmentStart());
|
||||
}
|
||||
|
||||
public void addVariant(VariantContext vc) {
|
||||
VCs.add(vc);
|
||||
}
|
||||
|
||||
private void emitReadsIfPossible(int alignmentStartOfLastRead) {
|
||||
//
|
||||
// 2 states:
|
||||
// -- reads ending << vc.getLocation()
|
||||
// -- read overlapping vc.getLocation() +/- context size
|
||||
//
|
||||
while ( ! waitingReads.isEmpty() ) { // there's something in the queue
|
||||
SAMRecord read = waitingReads.peek();
|
||||
if ( ! withinContextOfVariants(read, VCs) ) {
|
||||
addToConsensus(read);
|
||||
waitingReads.remove();
|
||||
} else if ( cannotOverlapFutureVC(read, alignmentStartOfLastRead) ) {
|
||||
emitConsensus();
|
||||
if ( ! INVERT ) finalDestination.addAlignment(read);
|
||||
waitingReads.remove();
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void addToConsensus(SAMRecord read) {
|
||||
if ( ! read.getDuplicateReadFlag() && ! read.getNotPrimaryAlignmentFlag() && ! read.getReadUnmappedFlag() )
|
||||
consensusReads.add(read);
|
||||
}
|
||||
|
||||
private void emitConsensus() {
|
||||
if ( ! consensusReads.isEmpty() ) {
|
||||
SAMRecord firstRead = consensusReads.peek();
|
||||
|
||||
int start = firstRead.getAlignmentStart();
|
||||
int end = furtherestEnd(consensusReads);
|
||||
int len = end - start + 1;
|
||||
|
||||
int[][] baseCounts = new int[len][4];
|
||||
for ( SAMRecord read : consensusReads ) {
|
||||
int readI = 0, refI = read.getAlignmentStart() - start;
|
||||
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
|
||||
int l = elt.getLength();
|
||||
switch (elt.getOperator()) {
|
||||
case N: // cannot handle these
|
||||
break;
|
||||
case H : case P : // ignore pads and hard clips
|
||||
break;
|
||||
case S : refI += l; // move the reference too, in addition to I
|
||||
case I :
|
||||
// todo - handle insertions?
|
||||
readI += l;
|
||||
break;
|
||||
case D :
|
||||
refI += l;
|
||||
break;
|
||||
case M :
|
||||
while (readI < l) {
|
||||
byte base = read.getReadBases()[readI++];
|
||||
int baseI = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( baseI >= 0 ) // no Ns
|
||||
baseCounts[refI][baseI]++;
|
||||
refI++;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
byte[] bases = new byte[len];
|
||||
byte[] quals = new byte[len];
|
||||
for ( int i = 0; i < len; i++) {
|
||||
final int maxI = maxBaseCount(baseCounts[i]);
|
||||
final int count = baseCounts[i][maxI];
|
||||
final byte base = count == 0 ? (byte)'N' : BaseUtils.baseIndexToSimpleBase(maxI);
|
||||
bases[i] = base;
|
||||
quals[i] = QualityUtils.boundQual(count, (byte)64);
|
||||
}
|
||||
|
||||
SAMRecord consensus = new SAMRecord(firstRead.getHeader());
|
||||
consensus.setReferenceIndex(firstRead.getReferenceIndex());
|
||||
consensus.setReadName("Mark");
|
||||
consensus.setCigarString(String.format("%dM", len));
|
||||
consensus.setReadPairedFlag(false);
|
||||
consensus.setAlignmentStart(start);
|
||||
consensus.setReadBases(bases);
|
||||
consensus.setBaseQualities(quals);
|
||||
consensus.setMappingQuality(60);
|
||||
|
||||
int nConsensus = consensusReads.size();
|
||||
nCompressedReads += nConsensus;
|
||||
logger.info(String.format("Compressing %5d reads into a single consensus", nConsensus));
|
||||
finalDestination.addAlignment(consensus);
|
||||
|
||||
if ( INVERT && PRINT_CONSENSUS_READS )
|
||||
for ( SAMRecord read : consensusReads )
|
||||
finalDestination.addAlignment(read);
|
||||
|
||||
consensusReads.clear();
|
||||
}
|
||||
}
|
||||
|
||||
private static int furtherestEnd(Collection<SAMRecord> reads) {
|
||||
int end = -1;
|
||||
for ( SAMRecord read : reads ) {
|
||||
end = Math.max(end, read.getAlignmentEnd());
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
private int maxBaseCount(int[] counts) {
|
||||
int maxI = 0;
|
||||
for ( int i = 0; i < counts.length; i++) {
|
||||
if ( counts[i] > counts[maxI] ) {
|
||||
maxI = i;
|
||||
}
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
private boolean cannotOverlapFutureVC(SAMRecord read, int alignmentStartOfLastRead) {
|
||||
return read.getAlignmentEnd() < alignmentStartOfLastRead - Math.max(SNPContextSize, IndelContextSize);
|
||||
}
|
||||
|
||||
private boolean withinContextOfVariants(SAMRecord read, Collection<VariantContext> vcs) {
|
||||
for ( VariantContext vc : vcs ) {
|
||||
if ( withinContextOfVariant(read, vc) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean withinContextOfVariant(SAMRecord read, VariantContext vc) {
|
||||
if ( ! read.getReferenceName().equals(vc.getChr()) )
|
||||
return false;
|
||||
else if ( vc.isVariant() ) {
|
||||
int contextSize = vc.isSNP() ? SNPContextSize : IndelContextSize;
|
||||
int vcContextStart = vc.getStart() - contextSize;
|
||||
int vcContextEnd = vc.getEnd() + contextSize;
|
||||
boolean notInContext = read.getAlignmentEnd() < vcContextStart || read.getAlignmentStart() > vcContextEnd;
|
||||
return ! notInContext;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @{inheritDoc}
|
||||
*/
|
||||
public void close() {
|
||||
// write out all of the remaining reads
|
||||
while ( ! waitingReads.isEmpty() ) { // there's something in the queue
|
||||
finalDestination.addAlignment(waitingReads.remove());
|
||||
}
|
||||
finalDestination.close();
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue