deleting accidentally committed junk
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4464 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b3d81984aa
commit
39e373af6e
|
|
@ -88,7 +88,7 @@ public class VariantContextAdaptors {
|
||||||
return null;
|
return null;
|
||||||
Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true);
|
Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true);
|
||||||
|
|
||||||
if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
|
if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
|
||||||
// add the reference allele
|
// add the reference allele
|
||||||
List<Allele> alleles = new ArrayList<Allele>();
|
List<Allele> alleles = new ArrayList<Allele>();
|
||||||
alleles.add(refAllele);
|
alleles.add(refAllele);
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||||
|
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
|
||||||
|
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
|
|
@ -84,6 +87,17 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false)
|
// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false)
|
||||||
// boolean FORMAT_VCF = false;
|
// boolean FORMAT_VCF = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "genotype_intervals", shortName = "genotype",
|
||||||
|
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
|
||||||
|
public String genotypeIntervalsFile = null;
|
||||||
|
|
||||||
|
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
|
||||||
|
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
||||||
|
"if the list turns out to be unsorted, it will throw an exception. "+
|
||||||
|
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
||||||
|
"to sort and keep it in memory (increases memory usage!).")
|
||||||
|
protected boolean GENOTYPE_NOT_SORTED = false;
|
||||||
|
|
||||||
@Argument(fullName="somatic", shortName="somatic",
|
@Argument(fullName="somatic", shortName="somatic",
|
||||||
doc="Perform somatic calls; two input alignment files (-I option) must be specified. Calls are performed from the second file (\"tumor\") against the first one (\"normal\").", required=false)
|
doc="Perform somatic calls; two input alignment files (-I option) must be specified. Calls are performed from the second file (\"tumor\") against the first one (\"normal\").", required=false)
|
||||||
boolean call_somatic = false;
|
boolean call_somatic = false;
|
||||||
|
|
@ -167,6 +181,11 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
private SAMRecord lastRead;
|
private SAMRecord lastRead;
|
||||||
private byte[] refBases;
|
private byte[] refBases;
|
||||||
private ReferenceDataSource refData;
|
private ReferenceDataSource refData;
|
||||||
|
private Iterator<GenomeLoc> genotypeIntervals = null;
|
||||||
|
private GenomeLocSortedSet traverseIntervals = null; // these are the traversal intervals passed with -L option (if any)
|
||||||
|
|
||||||
|
// the current interval in the list of intervals, for which we want to do full genotyping
|
||||||
|
private GenomeLoc currentGenotypeInterval = null;
|
||||||
|
|
||||||
// "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt"
|
// "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt"
|
||||||
|
|
||||||
|
|
@ -272,6 +291,22 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
normalSamples.add(rg.getSample());
|
normalSamples.add(rg.getSample());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if ( genotypeIntervalsFile != null ) {
|
||||||
|
|
||||||
|
traverseIntervals = getToolkit().getIntervals();
|
||||||
|
|
||||||
|
if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) {
|
||||||
|
// prepare to read intervals one-by-one, as needed (assuming they are sorted).
|
||||||
|
genotypeIntervals = new IntervalFileMergingIterator(
|
||||||
|
new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
|
||||||
|
} else {
|
||||||
|
// read in the whole list of intervals for cleaning
|
||||||
|
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(
|
||||||
|
IntervalUtils.parseIntervalArguments(Arrays.asList(genotypeIntervalsFile)), IntervalMergingRule.OVERLAPPING_ONLY);
|
||||||
|
genotypeIntervals = locs.iterator();
|
||||||
|
}
|
||||||
|
currentGenotypeInterval = genotypeIntervals.hasNext() ? genotypeIntervals.next() : null;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,69 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.utils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Aug 3, 2010
|
||||||
|
* Time: 4:10:42 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** A simple utility class that encapsulates information about a single alignment (offset, strand, overlap, mismatch count).
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class AlignmentInfo {
|
||||||
|
private int offset;
|
||||||
|
private int overlap;
|
||||||
|
private int mm ;
|
||||||
|
private Assembly a = null;
|
||||||
|
|
||||||
|
private static int RIDICULOUSLY_LARGE_NUMBER = 1000000000;
|
||||||
|
|
||||||
|
public AlignmentInfo() {
|
||||||
|
offset = 0;
|
||||||
|
mm = RIDICULOUSLY_LARGE_NUMBER;
|
||||||
|
overlap = -1;
|
||||||
|
a = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public AlignmentInfo(int mm, int offset, boolean isRc, int overlap, Assembly a) {
|
||||||
|
this.offset = (isRc ? (-offset-1) : offset );
|
||||||
|
this.overlap = overlap;
|
||||||
|
this.mm = mm;
|
||||||
|
this.a = a;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isAligned() { return mm < RIDICULOUSLY_LARGE_NUMBER; }
|
||||||
|
|
||||||
|
public boolean isNegativeStrand() { return offset < 0; }
|
||||||
|
public Assembly getAssembly() { return a; }
|
||||||
|
public int getOffset() { return ( offset < 0 ? (-offset-1) : offset ); }
|
||||||
|
public int getMismatchCount() { return mm; }
|
||||||
|
public int getOverlap() { return overlap; }
|
||||||
|
public double getMismatchRate() { return isAligned() ? ((double)mm)/overlap : 1.0; }
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,111 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.utils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
public class AlignmentList implements Iterable<AlignmentInfo> {
|
||||||
|
private int best_mm = 1000000000;
|
||||||
|
private int next_best_mm = 1000000000;
|
||||||
|
private List<AlignmentInfo> als = null;
|
||||||
|
private int next_best_count = 0;
|
||||||
|
private int best_overlap = 0;
|
||||||
|
|
||||||
|
private AlignmentStrategy strategy = null;
|
||||||
|
|
||||||
|
public AlignmentList(AlignmentStrategy s) {
|
||||||
|
this.strategy = s;
|
||||||
|
best_mm = 1000000000;
|
||||||
|
next_best_mm = 1000000000;
|
||||||
|
best_overlap = 0;
|
||||||
|
als = new ArrayList<AlignmentInfo>(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isAligned() {
|
||||||
|
return best_mm < 1000000000;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<AlignmentInfo> getAlignments() { return als; }
|
||||||
|
|
||||||
|
public int size() { return als.size(); }
|
||||||
|
|
||||||
|
public Iterator<AlignmentInfo> iterator() { return als.iterator(); }
|
||||||
|
|
||||||
|
// public void tryAdd(int mm, int offset, boolean isRc, int overlap) {
|
||||||
|
// tryAdd(new AlignmentInfo(mm,offset,isRc,overlap));
|
||||||
|
// }
|
||||||
|
|
||||||
|
public void tryAdd(AlignmentInfo ai) {
|
||||||
|
AlignmentStrategy.Action a = strategy.action(ai,this) ;
|
||||||
|
switch ( a ) {
|
||||||
|
case DISCARD: break;
|
||||||
|
case REPLACE_BEST:
|
||||||
|
next_best_mm = best_mm;
|
||||||
|
next_best_count = size();
|
||||||
|
als.clear();
|
||||||
|
als.add(ai);
|
||||||
|
best_mm = ai.getMismatchCount();
|
||||||
|
best_overlap = ai.getOverlap();
|
||||||
|
break;
|
||||||
|
case ADD_BEST:
|
||||||
|
als.add(ai);
|
||||||
|
if ( ai.getMismatchCount() < best_mm ) best_mm = ai.getMismatchCount();
|
||||||
|
if ( ai.getOverlap() > best_overlap) best_overlap = ai.getOverlap();
|
||||||
|
break;
|
||||||
|
case REPLACE_NEXTBEST:
|
||||||
|
next_best_mm = ai.getMismatchCount();
|
||||||
|
next_best_count = 1;
|
||||||
|
break;
|
||||||
|
case ADD_NEXTBEST:
|
||||||
|
next_best_count++;
|
||||||
|
if ( ai.getMismatchCount() < next_best_mm ) next_best_mm = ai.getMismatchCount();
|
||||||
|
break;
|
||||||
|
default: throw new StingException("Unrecognized action requested: "+a);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void tryAddAll(AlignmentList al) {
|
||||||
|
for( AlignmentInfo ai : al) {
|
||||||
|
tryAdd(ai);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getBestMMCount() { return best_mm; }
|
||||||
|
public int getBestOverlap() { return best_overlap; }
|
||||||
|
public int getBestHitCount() { return als.size() ; }
|
||||||
|
public int getNextBestHitCount() { return next_best_count; }
|
||||||
|
public int getNextBestMMCount() { return next_best_mm; }
|
||||||
|
// public int getOverlap() { return overlap; }
|
||||||
|
// public int getOffset() { return offset; }
|
||||||
|
// public boolean isNegativeStrand() { return rc; }
|
||||||
|
|
||||||
|
// public double getMismatchRate() { return isAligned() ? ((double)best_mm)/overlap : 1.0 ; }
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,45 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.utils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Aug 3, 2010
|
||||||
|
* Time: 2:53:43 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public interface AlignmentStrategy {
|
||||||
|
enum Action {
|
||||||
|
DISCARD,
|
||||||
|
REPLACE_BEST,
|
||||||
|
ADD_BEST,
|
||||||
|
REPLACE_NEXTBEST,
|
||||||
|
ADD_NEXTBEST
|
||||||
|
};
|
||||||
|
|
||||||
|
public Action action(AlignmentInfo alignment, AlignmentList currentList);
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,442 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.utils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
|
||||||
|
import net.sf.samtools.util.StringUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Aug 3, 2010
|
||||||
|
* Time: 2:20:22 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class Assembly {
|
||||||
|
private byte[] consensus;
|
||||||
|
private short[] coverage;
|
||||||
|
private short[] mismatches;
|
||||||
|
private short [][] base_counts;
|
||||||
|
|
||||||
|
private boolean debug = false;
|
||||||
|
private List<String> seq_ids;
|
||||||
|
private List<byte []> seqs;
|
||||||
|
private List<Integer> seq_offsets;
|
||||||
|
|
||||||
|
private KmerIndex lookup; // assembled consensus sequence is indexed here
|
||||||
|
|
||||||
|
private int hookedAt = -1; // if set, specifies start on the ref of the assembled consensus sequence
|
||||||
|
|
||||||
|
private static List<PrimitivePair.Int> EMPTY_KMER_LIST = new ArrayList<PrimitivePair.Int>(0);
|
||||||
|
|
||||||
|
private int K = 15;
|
||||||
|
|
||||||
|
private AlignmentStrategy strategy = null;
|
||||||
|
|
||||||
|
/** Creates new assembly seeded with the specified sequence; default key length (15) is used.
|
||||||
|
*
|
||||||
|
* @param seq
|
||||||
|
* @param id
|
||||||
|
*/
|
||||||
|
public Assembly(final byte[] seq, String id) {
|
||||||
|
this(15,seq,id);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Creates new assembly seeded with the specified sequence and sets kmer (key) length K for the internally maintained
|
||||||
|
* lookup index tables.
|
||||||
|
* @param K
|
||||||
|
* @param seq
|
||||||
|
* @param id
|
||||||
|
*/
|
||||||
|
public Assembly(int K, final byte[] seq, String id) {
|
||||||
|
this.K = K;
|
||||||
|
seq_ids = new ArrayList<String>();
|
||||||
|
seq_offsets = new ArrayList<Integer>();
|
||||||
|
seqs = new ArrayList<byte[]>();
|
||||||
|
seq_ids.add(id);
|
||||||
|
seq_offsets.add(0);
|
||||||
|
seqs.add(seq);
|
||||||
|
consensus = Arrays.copyOf(seq,seq.length);
|
||||||
|
coverage = new short[seq.length];
|
||||||
|
Arrays.fill(coverage,(short)1);
|
||||||
|
mismatches = new short[seq.length]; // filled with 0's
|
||||||
|
base_counts = new short[4][seq.length];
|
||||||
|
for ( int i = 0 ; i < seq.length ; i++ ) {
|
||||||
|
int j = BaseUtils.simpleBaseToBaseIndex(seq[i]);
|
||||||
|
if ( j != -1) base_counts[j][i] = 1;
|
||||||
|
}
|
||||||
|
lookup = new KmerIndex(K,seq);
|
||||||
|
strategy = new DefaultAlignmentStrategy();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Creates new assembly seeded with the specified sequence; default key length (15) is used and the position on the
|
||||||
|
* reference of the entire assembly is set (as assemblly grows, position on the ref will be updated properly).
|
||||||
|
*
|
||||||
|
* @param seq
|
||||||
|
* @param id
|
||||||
|
*/
|
||||||
|
public Assembly(final byte[] seq, String id, int posOnRef) {
|
||||||
|
this(seq,id);
|
||||||
|
hookedAt = posOnRef;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Creates new assembly seeded the specified sequence and sets kmer (key) length K for the internally maintained
|
||||||
|
* lookup index tables. Parameter <code>posOnRef</code> specifies the (initial) position of the entire assembly on the
|
||||||
|
* ref; as the assembly grows, the position on ref will be updated properly.
|
||||||
|
* @param K
|
||||||
|
* @param seq
|
||||||
|
* @param id
|
||||||
|
*/
|
||||||
|
public Assembly(int K, final byte[] seq, String id, int posOnRef) {
|
||||||
|
this(K,seq,id);
|
||||||
|
hookedAt = posOnRef;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns total number of sequences currently held by this assembly.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public int getNumSeqs() { return seqs.size() ; }
|
||||||
|
|
||||||
|
/** Attempts to align <code>seq</code> to this assembly's consensus. Does NOT add
|
||||||
|
* the sequence to the consensus even if it aligns! This methods returns a list of alternative
|
||||||
|
* best alignments found (according to the strategy used) in a newly allocated AlignmentList object.
|
||||||
|
* @param seq sequence to align to this consensus
|
||||||
|
* @param tryRC if true, will try aligning both seq and its reverse complement; otherwise
|
||||||
|
* only forward alignment will be attempted (i.e. best placement of the seq, as it is provided,
|
||||||
|
* along the assembled consensus sequence)
|
||||||
|
* @return a newly allocated alignment list; returned list can be empty if no alignments are found
|
||||||
|
*/
|
||||||
|
public AlignmentList align(final byte[] seq, boolean tryRC) {
|
||||||
|
return align(seq,tryRC,null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Attempts to align <code>seq</code> to this assembly's consensus. Does NOT add
|
||||||
|
* the sequence to the consensus even if it aligns! This method uses existing list of alignments
|
||||||
|
* (which can contain alignments to a different assembly) and updates it as necessary if a better alignment
|
||||||
|
* (or multiple better alignments) than the one(s) already held in the list is found. Reference to the
|
||||||
|
* <i>same</i> alignment list object is returned: this method modifies it's argument. If alignment list argument
|
||||||
|
* is <code>null</code>, new alignment list object will be allocated and returned by this method.
|
||||||
|
*
|
||||||
|
* @param seq sequence to align to this consensus
|
||||||
|
* @param tryRC if true, will try aligning both seq and its reverse complement; otherwise
|
||||||
|
* only forward alignment will be attempted (i.e. best placement of the seq, as it is provided,
|
||||||
|
* along the assembled consensus sequence)
|
||||||
|
* @return a newly allocated alignment list; returned list can be empty if no alignments are found
|
||||||
|
*/
|
||||||
|
public AlignmentList align(final byte[] seq, boolean tryRC, AlignmentList a) {
|
||||||
|
if ( debug ) System.out.println("Assembly:: aligning sequence of length "+seq.length+"; tryRC="+tryRC+"; K="+K);
|
||||||
|
|
||||||
|
List<PrimitivePair.Int> fw_kmers = KmerIndex.toKeyOffsetList(K,seq);
|
||||||
|
|
||||||
|
if ( debug ) {
|
||||||
|
for( PrimitivePair.Int kmer: fw_kmers) {
|
||||||
|
System.out.println("id="+kmer.getFirst()+" seq="+new String(KmerIndex.idToSeq(K,kmer.getFirst()))+" offset on seq="+kmer.getSecond());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
byte [] rc_seq = (tryRC ? BaseUtils.simpleReverseComplement(seq) : null );
|
||||||
|
List<PrimitivePair.Int> rc_kmers = (tryRC ? KmerIndex.toKeyOffsetList(K,rc_seq) : EMPTY_KMER_LIST );
|
||||||
|
|
||||||
|
if ( a == null ) a = new AlignmentList(strategy);
|
||||||
|
|
||||||
|
// i is the position on the sequence seq or on its reverse complement
|
||||||
|
for(PrimitivePair.Int kmer : fw_kmers ) {
|
||||||
|
|
||||||
|
List<Integer> offsets = lookup.getOffsets(kmer.first);
|
||||||
|
if ( offsets != null ) {
|
||||||
|
// kmer present in consensus sequence
|
||||||
|
for ( int s : offsets ) { // s=offset of the current kmer on the assembled consensus
|
||||||
|
int trial_offset = s - kmer.second; // offset of the seq on the assembled consensus suggested by current kmer/offset
|
||||||
|
int trial_mm = countMismatches(seq,trial_offset,a.getNextBestMMCount());
|
||||||
|
a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,false,overlap(trial_offset,seq.length),this));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( PrimitivePair.Int kmer : rc_kmers ) {
|
||||||
|
|
||||||
|
List<Integer> offsets = lookup.getOffsets(kmer.first);
|
||||||
|
if ( offsets != null ) {
|
||||||
|
// kmer present in consensus sequence
|
||||||
|
for ( int s : offsets ) {
|
||||||
|
int trial_offset = s - kmer.second;
|
||||||
|
int trial_mm = countMismatches(rc_seq,trial_offset,a.getNextBestMMCount());
|
||||||
|
a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,true,overlap(trial_offset,seq.length),this));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDebug(boolean d) { this.debug = d; lookup.setDebug(d);}
|
||||||
|
|
||||||
|
public int numSequences() { return seq_ids.size(); }
|
||||||
|
|
||||||
|
private int overlap(int offset, int seq_length ) {
|
||||||
|
return Math.min(consensus.length,offset+seq_length)-Math.max(0,offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int countMismatches(final byte seq[], int offset, int cutoff) {
|
||||||
|
int mm = 0;
|
||||||
|
|
||||||
|
int i ;
|
||||||
|
if ( offset >= 0 ) i = 0;
|
||||||
|
else { i = (-offset); offset = 0; }
|
||||||
|
for ( ; i < seq.length && offset < consensus.length ; i++ , offset++ ) {
|
||||||
|
if ( seq[i] != consensus[offset] ) {
|
||||||
|
mm++;
|
||||||
|
if ( mm > cutoff ) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return mm;
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte[] getConsensus() { return consensus; }
|
||||||
|
|
||||||
|
public int getPosOnRef() { return hookedAt; }
|
||||||
|
|
||||||
|
public int getConsensusLength() { return consensus.length; }
|
||||||
|
|
||||||
|
public List<Integer> getOffsets() { return seq_offsets; }
|
||||||
|
public int getOffset(int i) { return seq_offsets.get(i); }
|
||||||
|
|
||||||
|
public List<String> getSeqIds() { return Collections.unmodifiableList(seq_ids); }
|
||||||
|
|
||||||
|
/** Adds specified sequence to this assembly according to the provided
|
||||||
|
* alignment information. Will properly update consensus sequence of this assembly
|
||||||
|
* and all associated information (mismatches, base counts etc)
|
||||||
|
* @param seq
|
||||||
|
* @param id
|
||||||
|
* @param a
|
||||||
|
*/
|
||||||
|
public void add(final byte[] seq, String id, AlignmentInfo a) {
|
||||||
|
|
||||||
|
if ( ! a.isAligned() ) throw new StingException("Can not add sequence to the assembly: provided alignment is empty");
|
||||||
|
|
||||||
|
seq_ids.add(id);
|
||||||
|
|
||||||
|
int offset = a.getOffset();
|
||||||
|
int oldConsensusLength = consensus.length;
|
||||||
|
|
||||||
|
byte [] seq_to_add = ( a.isNegativeStrand() ? BaseUtils.simpleReverseComplement(seq) : seq);
|
||||||
|
|
||||||
|
seqs.add(seq_to_add);
|
||||||
|
|
||||||
|
int pos_on_seq = 0;
|
||||||
|
int pos_on_cons = 0;
|
||||||
|
|
||||||
|
int leftExtension = 0; // how many bases we added to the consensus on the left
|
||||||
|
int rightExtension = 0; // how many bases we added to the consensus on the right
|
||||||
|
|
||||||
|
if ( offset < 0 ) {
|
||||||
|
// if sequence sticks out to the left of the current consensus:
|
||||||
|
|
||||||
|
leftExtension = -offset;
|
||||||
|
for(int i = 0 ; i < seq_offsets.size() ; i++ ) {
|
||||||
|
// we are going to extend consensus to the left, so we need to update all current offsets:
|
||||||
|
seq_offsets.set(i,seq_offsets.get(i)+leftExtension);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( hookedAt > 0 ) hookedAt -= leftExtension;
|
||||||
|
// extend consensus and associated arrays to the left :
|
||||||
|
|
||||||
|
consensus = Utils.extend(consensus,offset,(byte)0); // remember, offset is negative here, extending to the left
|
||||||
|
coverage = Utils.extend(coverage,offset,(short)1) ;
|
||||||
|
mismatches = Utils.extend(mismatches,offset,(short)0);
|
||||||
|
for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],offset,(short)0);
|
||||||
|
|
||||||
|
for ( int j = 0 ; j < -offset ; j++ ) {
|
||||||
|
consensus[j] = seq_to_add[j];
|
||||||
|
int b = BaseUtils.simpleBaseToBaseIndex(seq_to_add[j]);
|
||||||
|
if ( b != -1 ) base_counts[b][j]=1;
|
||||||
|
}
|
||||||
|
|
||||||
|
pos_on_seq = pos_on_cons = -offset;
|
||||||
|
|
||||||
|
offset = 0;
|
||||||
|
}
|
||||||
|
if ( offset > 0 ) pos_on_cons = offset;
|
||||||
|
|
||||||
|
seq_offsets.add(offset);
|
||||||
|
|
||||||
|
boolean consensus_changed = false;
|
||||||
|
|
||||||
|
for ( ; pos_on_seq < seq_to_add.length && pos_on_cons < consensus.length ; pos_on_seq++, pos_on_cons++ ) {
|
||||||
|
coverage[pos_on_cons]++;
|
||||||
|
final byte base = seq_to_add[pos_on_seq];
|
||||||
|
final int b = BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
|
if ( b != -1 ) {
|
||||||
|
// if base on seq is not a regular base, there is nothing to do;
|
||||||
|
// otherwise count mismatches and optionally update consensus if current base tips the balance
|
||||||
|
base_counts[b][pos_on_cons]++;
|
||||||
|
int maxcount = 0;
|
||||||
|
int maxb = -1;
|
||||||
|
for ( int j = 0 ; j < 4 ; j++ ) {
|
||||||
|
if ( base_counts[j][pos_on_cons] > maxcount ) {
|
||||||
|
maxcount = base_counts[j][pos_on_cons];
|
||||||
|
maxb = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// we are guaranteed here that maxb != -1 since we just added one regular base (the current one)
|
||||||
|
// few lines above...
|
||||||
|
byte newbase = BaseUtils.baseIndexToSimpleBase(maxb);
|
||||||
|
if ( newbase != consensus[pos_on_cons] ) { // need to change the consensus base (will recompute mismatches)
|
||||||
|
consensus[pos_on_cons] = newbase;
|
||||||
|
consensus_changed = true;
|
||||||
|
mismatches[pos_on_cons] = 0;
|
||||||
|
for ( int i = 0 ; i < 4 ; i++ ) {
|
||||||
|
if ( i == maxb ) continue;
|
||||||
|
mismatches[pos_on_cons] += base_counts[i][pos_on_cons];
|
||||||
|
}
|
||||||
|
} else { // consensus base did not change; just increment mismatches if current sequence's base differs from consensus
|
||||||
|
if ( base != consensus[pos_on_cons]) mismatches[pos_on_cons]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Last step: if sequence sticks out of current consensus on the right, we need to extend the latter:
|
||||||
|
|
||||||
|
if ( pos_on_seq < seq_to_add.length ) {
|
||||||
|
// sequence sticks out of consensus to the right
|
||||||
|
rightExtension = seq_to_add.length - pos_on_seq;
|
||||||
|
consensus = Utils.extend(consensus,rightExtension,(byte)0);
|
||||||
|
coverage = Utils.extend(coverage,rightExtension,(short)1);
|
||||||
|
mismatches = Utils.extend(mismatches,rightExtension,(short)0);
|
||||||
|
for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],rightExtension,(short)0);
|
||||||
|
for ( ; pos_on_seq < seq_to_add.length ; pos_on_seq++, pos_on_cons++ ) {
|
||||||
|
byte base = seq_to_add[pos_on_seq];
|
||||||
|
consensus[pos_on_cons] = base;
|
||||||
|
int b = BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
|
if ( b != -1 ) base_counts[b][pos_on_cons] = base;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// finally, the new sequence we just added could have mismatches that tip some consensus bases into new values;
|
||||||
|
// let's catch those cases:
|
||||||
|
|
||||||
|
|
||||||
|
for ( int i = 0 ; i < consensus.length ; i++ ) {
|
||||||
|
byte cons_base = consensus[i];
|
||||||
|
int b = BaseUtils.simpleBaseToBaseIndex(cons_base);
|
||||||
|
}
|
||||||
|
|
||||||
|
// there is probably a better way, but for now we just recompute the whole lookup table when consensus
|
||||||
|
// changes somewhere in the middle (if we want to be samrt we need to identify just the kmers that changed
|
||||||
|
// and find/change them in the lookup table).
|
||||||
|
if ( consensus_changed ) {
|
||||||
|
lookup.clear();
|
||||||
|
lookup.index(consensus);
|
||||||
|
} else {
|
||||||
|
if ( leftExtension > 0 || rightExtension > 0 ) lookup.updateIndex(consensus,leftExtension,oldConsensusLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toAlignmentString(boolean mismatchesOnly, boolean printNames) {
|
||||||
|
|
||||||
|
int maxNameLength = 0;
|
||||||
|
int spacing=3;
|
||||||
|
|
||||||
|
if ( printNames ) {
|
||||||
|
for ( String n : seq_ids ) if ( n.length() > maxNameLength ) maxNameLength++;
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder b = new StringBuilder();
|
||||||
|
if ( printNames ) b.append(Utils.dupString(' ',maxNameLength+spacing));
|
||||||
|
b.append(new String(consensus));
|
||||||
|
b.append('\n');
|
||||||
|
|
||||||
|
for ( int j = 0; j < seqs.size() ; j++ ) {
|
||||||
|
int offset = seq_offsets.get(j);
|
||||||
|
byte [] seq = seqs.get(j);
|
||||||
|
|
||||||
|
if ( printNames ) {
|
||||||
|
b.append(seq_ids.get(j));
|
||||||
|
b.append(Utils.dupString(' ',maxNameLength-seq_ids.get(j).length()+spacing));
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( int i = 0 ; i < offset ; i++ ) b.append(' ');
|
||||||
|
|
||||||
|
for ( int i = 0 ; i < seq.length ; i++ ) {
|
||||||
|
|
||||||
|
byte base = seq[i];
|
||||||
|
if ( mismatchesOnly && base == consensus[i+offset] ) {
|
||||||
|
b.append('.');
|
||||||
|
} else b.append((char)base);
|
||||||
|
}
|
||||||
|
b.append('\n');
|
||||||
|
}
|
||||||
|
return b.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static void testMe(String [] argv ) {
|
||||||
|
byte [] seq1 = "ACGTTGCGTGGTTCACTGCAGTAACTGACTGATGCA".getBytes();
|
||||||
|
byte [] seq2 = "GCGTGGTTTACTGCAGTAACTGACTGATGCAACGTGTTTG".getBytes();
|
||||||
|
byte [] seq3 = "GGNTGACGTTGCGTGGTTTACTGCAGTAACTGACT".getBytes();
|
||||||
|
byte [] seq4 = "NNNTTNCGTGGTTTACTGCAGTAACTGACTGATGCA".getBytes();
|
||||||
|
|
||||||
|
Assembly a = new Assembly(seq1,"1");
|
||||||
|
|
||||||
|
AlignmentList al = a.align(seq2,false);
|
||||||
|
if ( al.isAligned() ) System.out.println("seq 2 aligned");
|
||||||
|
else System.out.println("seq 2 did NOT align");
|
||||||
|
|
||||||
|
if ( al.size() == 1 ) a.add(seq2,"2",al.getAlignments().get(0));
|
||||||
|
else System.out.println("Multiple alignments found for seq 2");
|
||||||
|
|
||||||
|
al = a.align(seq3,false);
|
||||||
|
if ( al.isAligned() ) System.out.println("seq 3 aligned");
|
||||||
|
else System.out.println("seq 3 did NOT align");
|
||||||
|
|
||||||
|
if ( al.size() == 1 ) a.add(seq3,"3",al.getAlignments().get(0));
|
||||||
|
else System.out.println("Multiple alignments found for seq 3");
|
||||||
|
|
||||||
|
al = a.align(seq4,false);
|
||||||
|
if ( al.isAligned() ) System.out.println("seq 4 aligned");
|
||||||
|
else System.out.println("seq 4 did NOT align");
|
||||||
|
|
||||||
|
if ( al.size() == 1 ) a.add(seq4,"4",al.getAlignments().get(0));
|
||||||
|
else System.out.println("Multiple alignments found for seq 4");
|
||||||
|
|
||||||
|
System.out.println(a.toAlignmentString(true, true));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.utils;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Sep 13, 2010
|
||||||
|
* Time: 5:53:33 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class AssemblyGraph {
|
||||||
|
|
||||||
|
private List<Assembly> sources;
|
||||||
|
|
||||||
|
public AssemblyGraph(Assembly a) {
|
||||||
|
sources = new LinkedList<Assembly>();
|
||||||
|
sources.add(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Initializes assembly from the single specified read, and sets this assembly as the root of this
|
||||||
|
* assembly graph
|
||||||
|
* @param r read; must be aligned, otherwise exception will be thrown
|
||||||
|
* @param K index (Kmer) size of the assembly that will be initialized with the read r
|
||||||
|
*/
|
||||||
|
public AssemblyGraph(SAMRecord r, int K) {
|
||||||
|
if (AlignmentUtils.isReadUnmapped(r))
|
||||||
|
throw new StingException("Can not initialize assembly graph with unaligned read");
|
||||||
|
sources = new LinkedList<Assembly>();
|
||||||
|
sources.add( new Assembly(K,r.getReadBases(),r.getReadName(), r.getAlignmentStart()) );
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(SAMRecord r) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.utils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Aug 3, 2010
|
||||||
|
* Time: 4:53:01 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class DefaultAlignmentStrategy implements AlignmentStrategy {
|
||||||
|
|
||||||
|
public Action action(AlignmentInfo alignment, AlignmentList currentList) {
|
||||||
|
if ( currentList.size() == 0 ) return Action.REPLACE_BEST;
|
||||||
|
|
||||||
|
if ( alignment.getMismatchCount() > currentList.getNextBestMMCount() ) return Action.DISCARD;
|
||||||
|
|
||||||
|
if ( alignment.getMismatchCount() < currentList.getBestMMCount() ) return Action.REPLACE_BEST;
|
||||||
|
if ( alignment.getMismatchCount() == currentList.getBestMMCount() ) return Action.ADD_BEST;
|
||||||
|
if ( alignment.getMismatchCount() < currentList.getNextBestMMCount() ) return Action.REPLACE_NEXTBEST;
|
||||||
|
if ( alignment.getMismatchCount() == currentList.getNextBestMMCount() ) return Action.ADD_NEXTBEST;
|
||||||
|
|
||||||
|
throw new StingException("Unexpected case found and left unprocessed");
|
||||||
|
// return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,332 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.utils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Aug 3, 2010
|
||||||
|
* Time: 1:31:15 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class KmerIndex {
|
||||||
|
private HashMap<Integer,List<Integer> > lookup;
|
||||||
|
private int K = -1;
|
||||||
|
private int mask = 0;
|
||||||
|
private boolean debug = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Translates sequence <code>seq</code> into the list of all valid kmers paired
|
||||||
|
* with their offsets on that sequence. Valid kmer is a kmer that contains only ACGT bases; if <code>seq</code>
|
||||||
|
* contains any other symbols, no kmers overlapping with such symbols will be generated. This method
|
||||||
|
* returns a linear (possibly gapped if non-ACGT symbols are present) representation of the sequence as its kmers
|
||||||
|
* with corresponding offsets, NOT a lookup index. If a specific kmer occurs on the sequence N times,
|
||||||
|
* the returned list will have N occurences of this kmer, each paired with one unique location on the sequence.
|
||||||
|
* Kmers themselves are represented as integer kmer ids here, see #idToSeq() if string (ACGT bases) representation
|
||||||
|
* of kmers is needed. Empty list if returned if no valid kmers are found on the sequence (i.e. too many non-ACGT bases)
|
||||||
|
*
|
||||||
|
* @param K key (kmer) length
|
||||||
|
* @param seq sequence to translate into kmer/offset representation
|
||||||
|
* @return list of kmer/offsets
|
||||||
|
*/
|
||||||
|
public static List<PrimitivePair.Int> toKeyOffsetList(int K, byte seq[]) {
|
||||||
|
return toKeyOffsetList(K,seq,0,seq.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Same as #toKeyOffsetList(int K, byte [] seq) (see docs), except that this method is not static and
|
||||||
|
* uses key length K associated with the specific instance of the KmerIndex class.
|
||||||
|
* @param seq
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public List<PrimitivePair.Int> toKeyOffsetList(byte [] seq) {
|
||||||
|
return toKeyOffsetList(this.K,seq);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns an ordered sequence of overlapping (1-shift in the ideal case) k-mers of length K found in the subsequence of
|
||||||
|
* length <code>length</code> of sequence <code>seq</code> starting at position <code>start</code>. All returned kmers
|
||||||
|
* are fully subsumed by the interval [start, start+length) on the sequence <code>seq</code> (no partial overlaps).
|
||||||
|
* Each kmer is paired with its offset on the (full length) <code>seq</code> in the returned list.
|
||||||
|
* Note that only k-mers on the forward strand are returned. You need to manually rc the string and
|
||||||
|
* call toKeyOffsetList() again to get rc k-mers. If sequence contains any other symbols than ACGT, all k-mers
|
||||||
|
* that would overlap with those symbols will be skipped (not present in the returned list). See also
|
||||||
|
* #toKeyOffsetList(int K, byte [] seq) which translates the whole sequence <code>seq</code>.
|
||||||
|
*
|
||||||
|
* @param K index key (k-mer) length
|
||||||
|
* @param seq sequence to compute k-mers from
|
||||||
|
* @param start compute kmers for subsequence [start, start+length) of <code>seq</code>
|
||||||
|
* @param length compute kmers for subsequence [start, start+length) of <code>seq</code>
|
||||||
|
* @return a list of pairs (kmer,offset_on_the_seq) for each valid kmer (i.e. kmer that does not overlap with
|
||||||
|
* non-ACGT bases); if no valid kmers exist, the returned list will be empty.
|
||||||
|
*/
|
||||||
|
public static List<PrimitivePair.Int> toKeyOffsetList(int K, byte[] seq, int start, int length) {
|
||||||
|
|
||||||
|
if ( length < K ) throw new StingException("Can not index sequence that is shorter than key length: total seq="+seq.length+"; start="+start+"; length="+length);
|
||||||
|
|
||||||
|
int mask = 0;
|
||||||
|
if ( length > K ) {
|
||||||
|
for ( int i = 0; i < K ; i++ ) {
|
||||||
|
mask <<= 2;
|
||||||
|
mask |= 0x03;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int key = 0;
|
||||||
|
int i ;
|
||||||
|
final int final_pos = start+length; // first base *after* the last position we want to index
|
||||||
|
|
||||||
|
ArrayList<PrimitivePair.Int> l = new ArrayList<PrimitivePair.Int>(length-K+1);
|
||||||
|
|
||||||
|
PrimitivePair.Int firstK = toFirstKey(K,seq,start,final_pos);
|
||||||
|
if ( firstK == null ) {
|
||||||
|
// ooops, too many non-ACGT bases, we were not able to find a single valid k-mer on the whole sequence!
|
||||||
|
return l;
|
||||||
|
}
|
||||||
|
|
||||||
|
l.add(firstK);
|
||||||
|
|
||||||
|
start = firstK.getSecond();
|
||||||
|
i = start + K; // i points to the first base after the returned kmer firstK
|
||||||
|
key = firstK.getFirst();
|
||||||
|
|
||||||
|
// now let's try recomputing next kmers in an efficient way: we reuse previous kmer and add only the new last base.
|
||||||
|
// This will break if we encounter a non-ACGT base, in which case we will have to start over.
|
||||||
|
|
||||||
|
for ( start++ ; i < final_pos ; i++, start++ ) {
|
||||||
|
int d = BaseUtils.simpleBaseToBaseIndex(seq[i]);
|
||||||
|
if ( d == -1 ) {
|
||||||
|
// ooops, we ran into a bad base; let's jump over it completely and reinitialize the key
|
||||||
|
// (since all kmers overlapping with the current base are invalid)
|
||||||
|
firstK = toFirstKey(K,seq,i+1,final_pos);
|
||||||
|
if ( firstK == null ) break; // no more valid kmers
|
||||||
|
l.add(firstK);
|
||||||
|
start = firstK.getSecond();
|
||||||
|
i = start+K; // points to the base right after the Kmer we just found
|
||||||
|
key = firstK.getFirst(); // reset key to the new kmer we just found
|
||||||
|
} else {
|
||||||
|
// the base is good, so we can compute our new kmer very efficiently using the old one:
|
||||||
|
key <<= 2;
|
||||||
|
key &= mask;
|
||||||
|
key += d;
|
||||||
|
l.add(new PrimitivePair.Int(key,start));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return l;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Non-static version of #toKeyOffsetList(int K, byte [] seq, int start, int length) (see docs), which
|
||||||
|
* uses key length K associated with this instance of the KmerIndex object.
|
||||||
|
* @param seq
|
||||||
|
* @param start
|
||||||
|
* @param length
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public List<PrimitivePair.Int> toKeyOffsetList(byte[] seq, int start, int length) {
|
||||||
|
return toKeyOffsetList(this.K,seq,start,length);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Computes index (key) of the first valid kmer in the interval [start,stop) of the sequence seq. Kmer is valid
|
||||||
|
* if it contains only valid (ACGT) bases. Returns key and actual offset of first such kmer found, or null
|
||||||
|
* if such kmer does not exist (i.e. if seq does not contain a continuous span of ACGT bases at least K bases long).
|
||||||
|
* @param K
|
||||||
|
* @param seq
|
||||||
|
* @param start
|
||||||
|
* @param stop
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private static PrimitivePair.Int toFirstKey(int K, byte[] seq, int start, int stop) {
|
||||||
|
int d = -1;
|
||||||
|
int key = 0 ;
|
||||||
|
while ( d == -1 && start < stop - K + 1) {
|
||||||
|
key = 0;
|
||||||
|
for ( int i = start ; i < start+K; i++ ) {
|
||||||
|
key <<= 2;
|
||||||
|
d = BaseUtils.simpleBaseToBaseIndex(seq[i]);
|
||||||
|
if ( d == -1) {
|
||||||
|
// ooops, non-ACGT base found, abort and start over. Next kmer that
|
||||||
|
// have a chance to be valid (contain only ACGT bases) can start only after the current position:
|
||||||
|
start = i+1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
key += d;
|
||||||
|
}
|
||||||
|
} // got the first key
|
||||||
|
|
||||||
|
if ( d != -1 ) return new PrimitivePair.Int(key,start);
|
||||||
|
else return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Creates an empty kmer index table with specified key length
|
||||||
|
*
|
||||||
|
* @param K
|
||||||
|
*/
|
||||||
|
public KmerIndex(final int K) {
|
||||||
|
if ( K > 16 ) throw new StingException("Lookup keys longer than 16 bases are currently not supported");
|
||||||
|
if ( K % 2 == 0 ) throw new StingException("Even keys require additional processing of palindromes, currently not supported. Please use odd key.");
|
||||||
|
this.K = K;
|
||||||
|
|
||||||
|
mask = 0;
|
||||||
|
for ( int i = 0; i < K; i++ ) {
|
||||||
|
mask <<= 2;
|
||||||
|
mask |= 0x03;
|
||||||
|
} // got the first key
|
||||||
|
|
||||||
|
lookup = new HashMap<Integer,List<Integer>>();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Builds kmer index table with key length <code>K</code> for the sequence <code>seq</code>.
|
||||||
|
*
|
||||||
|
* @param K
|
||||||
|
* @param seq
|
||||||
|
*/
|
||||||
|
public KmerIndex(final int K, final byte[] seq) {
|
||||||
|
this(K);
|
||||||
|
|
||||||
|
if ( seq.length < K ) throw new StingException("Sequence is shorter than requested lookup index key length");
|
||||||
|
|
||||||
|
addToIndex(toKeyOffsetList(K,seq,0,seq.length));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDebug(boolean d) { this.debug = d; }
|
||||||
|
|
||||||
|
/** Clears current lookup index table completely (but preserves the key length previously set).
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public void clear() { lookup.clear(); }
|
||||||
|
|
||||||
|
/** Builds complete index for the sequence seq. This method can be used only when lookup table is
|
||||||
|
* empty (i.e. use #clear() first), otherwise an exception will be thrown.
|
||||||
|
* @param seq
|
||||||
|
*/
|
||||||
|
public void index(final byte[] seq) {
|
||||||
|
if ( ! lookup.isEmpty() ) {
|
||||||
|
throw new StingException("Can not index new sequence: lookup table is already non-empty");
|
||||||
|
}
|
||||||
|
addToIndex(toKeyOffsetList(K,seq,0,seq.length));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates existing index. It is assumed that the sequence that was already indexed by this KmerIndex object is
|
||||||
|
* the exact subsequence of length <code>old_length</code> of the new sequence <code>seq</code>, starting at
|
||||||
|
* position <code>old_start</code>. No checks are performed, so it is the responsibility of the caller to ensure
|
||||||
|
* that this is indeed the case, otherwise the index will be inconsistent. Since the old sequence is a part
|
||||||
|
* of the new one, this method will keep all the already computed kmers (and update their offsets as needed),
|
||||||
|
* and compute and add kmers/offsets for all the novel bases added to the sequence <code>seq</code> compared
|
||||||
|
* to the old, already indexed subsequnce. If <code>old_length</code> is less than
|
||||||
|
* K (i.e. old sequence could not be and was not indexed at all), the new sequence <code>seq</code> will
|
||||||
|
* be fully indexed from start to end.
|
||||||
|
* @param seq
|
||||||
|
* @param old_start already indexed subsequence starts at this position in <code>seq</code>
|
||||||
|
* @param old_length length of the already indexed subsequence
|
||||||
|
*/
|
||||||
|
public void updateIndex(final byte[] seq, final int old_start, final int old_length) {
|
||||||
|
|
||||||
|
if ( old_length < K ) {
|
||||||
|
if ( ! lookup.isEmpty())
|
||||||
|
throw new StingException("It is claimed that old indexed sequence is shorter than K (i.e. it could not be indexed), but index is non empty");
|
||||||
|
addToIndex( toKeyOffsetList(K,seq,0,seq.length));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( old_start > 0 ) {
|
||||||
|
// update positions of previously indexed k-mers:
|
||||||
|
for ( Map.Entry<Integer,List<Integer>> e : lookup.entrySet() ) {
|
||||||
|
List<Integer> l = e.getValue();
|
||||||
|
for ( int i = 0 ; i < l.size(); i++ ) l.set(i,l.get(i)+old_start);
|
||||||
|
}
|
||||||
|
// take care of additional k-mers appearing *before* the already indexed subsequence:
|
||||||
|
// if already indexed subsequence starts at 'start', the first k-mer from that sequence
|
||||||
|
// ends at start+K-1 (inclusive) and it is obviously already indexed. So the last k-mer we want to index now ends at
|
||||||
|
// start+K-2 (inclusive), the length of [0,start+K-2] interval that we need to index is
|
||||||
|
// start+K-1.
|
||||||
|
addToIndex( toKeyOffsetList(K,seq,0,old_start+K-1) );
|
||||||
|
}
|
||||||
|
|
||||||
|
// the last k-mer we already indexed ends at start+length-1 (inclusive); so it starts at start+length-1-(K-1)=start+length-K.
|
||||||
|
// Hence, the first k-mer that is not indexed yet starts at start+length-K+1. The length of the subsequence that
|
||||||
|
// we need to index, [start+length-K+1,seq.length) is seq.length - start - length +K - 1
|
||||||
|
|
||||||
|
int pos = old_start+old_length-K+1;
|
||||||
|
addToIndex( toKeyOffsetList(K,seq,pos,seq.length-pos) );
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Convenience shortcut: takes the list of keys/offsets and pushes offsets into the lookup index for the keys that
|
||||||
|
* do exist already, or first creates the new entry and then pushes the offset for keys that are novel. This method
|
||||||
|
* is quiet: if <code>keys</code> is <code>null</code> or an empty list, it does nothing.
|
||||||
|
* @param keys
|
||||||
|
*/
|
||||||
|
private void addToIndex(final List<PrimitivePair.Int> keys ) {
|
||||||
|
if ( keys == null ) return;
|
||||||
|
for ( PrimitivePair.Int key: keys ) {
|
||||||
|
List<Integer> l = lookup.get(key.getFirst());
|
||||||
|
if ( l == null ) {
|
||||||
|
l = new ArrayList<Integer>();
|
||||||
|
lookup.put(key.getFirst(),l);
|
||||||
|
}
|
||||||
|
l.add(key.getSecond());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts kmer (integer key) of length K into its sequence representation. Returns a sequence (over ACGT alphabet)
|
||||||
|
* of length K that corresponds to the specified key.
|
||||||
|
* @param K
|
||||||
|
* @param kmer
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static byte [] idToSeq(int K, int kmer) {
|
||||||
|
byte [] seq = new byte[K];
|
||||||
|
for ( int i = K-1; i >=0 ; i-- ) {
|
||||||
|
seq[i] = BaseUtils.baseIndexToSimpleBase(kmer & 0x3);
|
||||||
|
kmer >>= 2;
|
||||||
|
}
|
||||||
|
return seq;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns all offsets for the specified kmer (key) on the sequence indexed by this KmerIndex object. Returns
|
||||||
|
* null if specified kmer is not present on the indexed sequence.
|
||||||
|
* @param key
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public List<Integer> getOffsets(int key) { return lookup.get(key); }
|
||||||
|
// public List<Integer> getOffsets(byte[] seq) {
|
||||||
|
// if ( seq.length != K ) throw new StingException("Can not perform direct lookup of a sequence with length different from key size");
|
||||||
|
//
|
||||||
|
// return getOffsets( toKey(seq) ) ;
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,401 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.utils;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Aug 6, 2010
|
||||||
|
* Time: 6:18:01 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class ReadPair {
|
||||||
|
|
||||||
|
public enum PairType {
|
||||||
|
UNKNOWN,
|
||||||
|
BOTH_UNMAPPED,
|
||||||
|
ONE_UNMAPPED,
|
||||||
|
PROPER,
|
||||||
|
LEFT,
|
||||||
|
RIGHT,
|
||||||
|
OUTER,
|
||||||
|
INTER
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
private SAMRecord end1 = null;
|
||||||
|
private SAMRecord end2 = null;
|
||||||
|
private PairType pType = PairType.UNKNOWN;
|
||||||
|
private int leftStart = -1;
|
||||||
|
private int rightStart = -1;
|
||||||
|
private SAMRecord leftRead = null;
|
||||||
|
private SAMRecord rightRead = null;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/** Creates an empty read pair object */
|
||||||
|
public ReadPair() {}
|
||||||
|
|
||||||
|
/** Creates a read pair objects initialized with the specified read */
|
||||||
|
public ReadPair(SAMRecord read) {
|
||||||
|
addRead(read);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns name of the paired read (it is assumed that both individual reads in the pair share same name).
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public String getName() { return ( end1 != null ? end1.getReadName() : (end2 != null ? end2.getReadName() : null) ); }
|
||||||
|
|
||||||
|
/** Returns true if both ends are recorded in this read pair object. Note that because SAM records carry
|
||||||
|
* mate information, a pair can be (partially) initialized from one end. This method verifies that this is not the case
|
||||||
|
* and both records are actually present.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean hasBothEnds() { return end1 != null && end2 != null ; }
|
||||||
|
|
||||||
|
/** Returns true if this pair object was initialized with at least one end. Since SAM records carry mate information,
|
||||||
|
* it is sometimes sufficient to have only one read (fragment end) actually recorded in the pair object, at which
|
||||||
|
* point some useful information can be retrieved for the pair already.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean hasAnyData() { return end1 != null || end2 != null ; }
|
||||||
|
|
||||||
|
/** Returns true if both ends in the pair are mapped. The pair object must be at least partially initialized (i.e.
|
||||||
|
* it has to hold a reference to at least one end of the pair), otherwise an exception will be thrown.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean bothEndsMapped() {
|
||||||
|
if ( pType == PairType.UNKNOWN ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
|
||||||
|
|
||||||
|
if ( pType == PairType.BOTH_UNMAPPED || pType == PairType.ONE_UNMAPPED ) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true if both ends in the pair are mapped uniquely. This method requires both ends being already registered
|
||||||
|
* in this pair object (i.e. hasBothEnds() is true), otherwise an exception will be thrown.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean bothEndsUniquelyMapped() {
|
||||||
|
if ( ! hasBothEnds() ) throw new StingException("Can not determine if both ends are uniquely mapped until both ends are recorded");
|
||||||
|
return bothEndsMapped() && end1.getMappingQuality() > 0 && end2.getMappingQuality() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true if this pair is in proper orientation, i.e. ---> <--- on the same contig */
|
||||||
|
public boolean isProper() { return pType == PairType.PROPER; }
|
||||||
|
|
||||||
|
/* Returns true if this pair is in outer orientation, i.e. <--- ---> on the same chromosome */
|
||||||
|
public boolean isOuter() { return pType == PairType.OUTER; }
|
||||||
|
|
||||||
|
/** Returns left (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map
|
||||||
|
* onto the same contig, otherwise an exception will be thrown.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public SAMRecord getLeftRead() {
|
||||||
|
if ( ! bothEndsMapped() || pType == PairType.INTER )
|
||||||
|
throw new StingException("Left read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName());
|
||||||
|
if ( leftRead == null )
|
||||||
|
throw new StingException("Left read is not recorded. Maybe we have not seen it yet? Pair: "+getName());
|
||||||
|
return leftRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns right (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map
|
||||||
|
* onto the same contig, otherwise an exception will be thrown.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public SAMRecord getRightRead() {
|
||||||
|
if ( ! bothEndsMapped() || pType == PairType.INTER )
|
||||||
|
throw new StingException("Right read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName());
|
||||||
|
if ( rightRead == null )
|
||||||
|
throw new StingException("Right read is not recorded. Maybe we have not seen it yet? Pair: "+getName());
|
||||||
|
return rightRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord getEnd1() { return end1; }
|
||||||
|
public SAMRecord getEnd2() { return end2; }
|
||||||
|
|
||||||
|
public PairType getPairType() { return pType ; }
|
||||||
|
|
||||||
|
public void addRead(SAMRecord r) {
|
||||||
|
if ( ! r.getReadPairedFlag() ) throw new StingException("Read "+r.getReadName() +" is unpaired");
|
||||||
|
if ( r.getFirstOfPairFlag() ) {
|
||||||
|
if ( end1 != null ) throw new StingException("Read "+r.getReadName()+" is first of pair and the pair already has first read recorded");
|
||||||
|
end1 = r;
|
||||||
|
if ( end2 != null && ! end1.getReadName().equals(end2.getReadName()) )
|
||||||
|
throw new StingException("The pair already has read "+end2.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" );
|
||||||
|
} else {
|
||||||
|
if ( r.getSecondOfPairFlag() ) {
|
||||||
|
if ( end2 != null ) throw new StingException("Read "+r.getReadName()+" is second of pair and the pair already has second read recorded");
|
||||||
|
end2 = r;
|
||||||
|
if ( end1 != null && ! end1.getReadName().equals(end2.getReadName()) )
|
||||||
|
throw new StingException("The pair already has read "+end1.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" );
|
||||||
|
} else {
|
||||||
|
throw new StingException("The read "+r.getReadName()+" is marked as paired, but the first/second of pair flag is not set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
setPairInfo(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** If pair type has not been set yet, then sets it to <code>t</code>. Otherwise (pair type already set),
|
||||||
|
* just checks if the pair type is <code>t</t>. If it is, the method returns quietly; if it is not (inconsistency detected),
|
||||||
|
* throws an exception.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
private void setCheckPairType(PairType t) {
|
||||||
|
if ( pType != PairType.UNKNOWN ) {
|
||||||
|
if ( pType != t )
|
||||||
|
throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
|
||||||
|
} else pType = t;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setCheckLeftStart(int pos) {
|
||||||
|
if ( leftStart >= 0 ) {
|
||||||
|
if ( leftStart != pos )
|
||||||
|
throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
|
||||||
|
} else leftStart = pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setCheckRightStart(int pos) {
|
||||||
|
if ( rightStart >= 0 ) {
|
||||||
|
if ( rightStart != pos )
|
||||||
|
throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
|
||||||
|
} else rightStart = pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setPairInfo(SAMRecord read) {
|
||||||
|
|
||||||
|
setCheckPairType(getPairType(read));
|
||||||
|
|
||||||
|
// there is nothing left to do unless both ends are mapped onto the same contig:
|
||||||
|
if ( pType == PairType.INTER ) return;
|
||||||
|
|
||||||
|
if ( pType == PairType.ONE_UNMAPPED ) {
|
||||||
|
// set putative left or right read depending on the orientation of the only mapped mate
|
||||||
|
if ( ! AlignmentUtils.isReadUnmapped(read ) ) {
|
||||||
|
// we can set left/right read only if it is the current read that is mapped; if we have the
|
||||||
|
// unmapped mate, skip and wait for the mapped read to come!
|
||||||
|
if ( read.getReadNegativeStrandFlag() ) {
|
||||||
|
setCheckRightStart(read.getAlignmentStart());
|
||||||
|
if ( rightRead != null ) throw new StingException("Right read was already set for the pair");
|
||||||
|
rightRead = read;
|
||||||
|
} else {
|
||||||
|
setCheckLeftStart(read.getAlignmentStart());
|
||||||
|
if ( leftRead != null ) throw new StingException("Left read was already set for the pair");
|
||||||
|
leftRead = read;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we are here if both ends are mapped and they map onto the same contig
|
||||||
|
if ( read.getAlignmentStart() < read.getMateAlignmentStart() ) { //left/right = read/mate
|
||||||
|
|
||||||
|
setCheckLeftStart(read.getAlignmentStart());
|
||||||
|
setCheckRightStart(read.getMateAlignmentStart());
|
||||||
|
|
||||||
|
if ( leftRead != null ) throw new StingException("Left read was already set for the pair");
|
||||||
|
leftRead = read;
|
||||||
|
} else {
|
||||||
|
// left/right = mate/read
|
||||||
|
|
||||||
|
setCheckLeftStart(read.getMateAlignmentStart());
|
||||||
|
setCheckRightStart(read.getAlignmentStart());
|
||||||
|
|
||||||
|
if ( rightRead != null ) throw new StingException("Right read was already set for the pair");
|
||||||
|
rightRead = read;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns pair type that describes this read and its mate. The alignment information for both the read itself
|
||||||
|
* and its mate is taken from the read's sam record passed as the argument, so the mate information is expected to be
|
||||||
|
* correctly set!
|
||||||
|
* @param read
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static PairType getPairType(SAMRecord read) {
|
||||||
|
|
||||||
|
if ( AlignmentUtils.isReadUnmapped(read) ) {
|
||||||
|
if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.BOTH_UNMAPPED;
|
||||||
|
else return PairType.ONE_UNMAPPED;
|
||||||
|
}
|
||||||
|
|
||||||
|
return getWouldBePairType(read,read.getReferenceIndex(),read.getAlignmentStart(),read.getReadNegativeStrandFlag());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns pair type that would describe this read and its mate, if this read mapped onto refId:start in orientation
|
||||||
|
* given by rc (forward is rc=false, reverse is rc=true). The read's alignment information (if any,
|
||||||
|
* unmapped reads are allowed) present in the SAM record is completely ignored by this method,
|
||||||
|
* only mate's information is used.
|
||||||
|
* @param read
|
||||||
|
* @param refId
|
||||||
|
* @param start
|
||||||
|
* @param rc
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static PairType getWouldBePairType(SAMRecord read, int refId, int start, boolean rc) {
|
||||||
|
|
||||||
|
|
||||||
|
if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.ONE_UNMAPPED ;
|
||||||
|
|
||||||
|
// both read and mate are mapped:
|
||||||
|
|
||||||
|
if ( refId != read.getMateReferenceIndex() ) return PairType.INTER;
|
||||||
|
|
||||||
|
// both read and its mate map onto the same chromosome
|
||||||
|
|
||||||
|
if ( start < read.getMateAlignmentStart() ) { //left/right = read/mate
|
||||||
|
|
||||||
|
if ( rc ) {
|
||||||
|
if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT;
|
||||||
|
else return PairType.OUTER;
|
||||||
|
} else {
|
||||||
|
if ( read.getMateNegativeStrandFlag() ) return PairType.PROPER;
|
||||||
|
else return PairType.RIGHT;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// left/right = mate/read
|
||||||
|
|
||||||
|
if ( rc ) {
|
||||||
|
if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT;
|
||||||
|
else return PairType.PROPER;
|
||||||
|
} else {
|
||||||
|
if ( read.getMateNegativeStrandFlag() ) return PairType.OUTER;
|
||||||
|
else return PairType.RIGHT;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getLeftStart() {
|
||||||
|
if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
|
||||||
|
return leftStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getRightStart() {
|
||||||
|
if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
|
||||||
|
return rightStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getFragmentSize() {
|
||||||
|
if ( ! hasBothEnds() ) throw new StingException("Can not determine fragment size: pair object does not have both ends yet");
|
||||||
|
if ( ! bothEndsMapped() ) throw new StingException("Can not determine fragment size: both ends must be mapped");
|
||||||
|
if ( pType != PairType.PROPER ) throw new StingException("The pais is not in proper orientation, can not determine fragment size");
|
||||||
|
|
||||||
|
return getFragmentSize(leftRead,rightRead);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Given a read (that must belong to this pair), returns the other end in the pair if it is already
|
||||||
|
* recorded, or null otherwise.
|
||||||
|
* @param read
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public SAMRecord getOtherEnd(SAMRecord read) {
|
||||||
|
if ( read.getFirstOfPairFlag() ) return end2;
|
||||||
|
else {
|
||||||
|
if ( read.getSecondOfPairFlag() ) return end1;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int getFragmentSize(SAMRecord left, SAMRecord right) {
|
||||||
|
|
||||||
|
if ( left == null || right == null ||
|
||||||
|
AlignmentUtils.isReadUnmapped(left) || AlignmentUtils.isReadUnmapped(right) ) {
|
||||||
|
throw new StingException("No read (null) or unmapped read provided: fragment size is not defined");
|
||||||
|
}
|
||||||
|
if ( left.getReferenceIndex() != right.getReferenceIndex() ) {
|
||||||
|
throw new StingException("Left/right reads map onto different contigs: fragment size is not defined");
|
||||||
|
}
|
||||||
|
|
||||||
|
int fragment_length = left.getReadLength(); // fragment is at least as long as the left read, duh!
|
||||||
|
int leftEnd = left.getAlignmentEnd();
|
||||||
|
int rightStart = right.getAlignmentStart();
|
||||||
|
|
||||||
|
if ( rightStart > leftEnd ) {
|
||||||
|
// if reads are not overlapping, fragment length is lengths of both reads plus the distance (gap) between
|
||||||
|
// the reads. Note that if the sequence between the reads happens to have insirtions or deletions,
|
||||||
|
// our estimation of the actual distance between the reads (on the fragment) is incorrect, but we
|
||||||
|
// can not do better given just those reads. This estimation is, in particular, incorrect
|
||||||
|
// for left reads ending with 'I' and/or right reads starting with 'I'
|
||||||
|
//
|
||||||
|
// left right
|
||||||
|
// -------->...gap...<-------- fragment = left+gap+right
|
||||||
|
|
||||||
|
return left.getReadLength() + right.getReadLength() + (rightStart - leftEnd-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we are here, the reads do overlap; fragment length is lengths of the two reads less the overlap.
|
||||||
|
// in this case we can compute the actual overlap between the reads (on the fragment) taking into
|
||||||
|
// account indels, if any
|
||||||
|
//
|
||||||
|
// left **** right
|
||||||
|
// ------------> ****=overlap; fragment = left+right - overlap
|
||||||
|
// <--------------
|
||||||
|
//
|
||||||
|
// with deletion:
|
||||||
|
//
|
||||||
|
// left ** ** right
|
||||||
|
// -----------ddd-> ****=overlap; fragment = left+right - overlap
|
||||||
|
// <-ddd------------- note that overlap != leftEnd - rightStart+1
|
||||||
|
// instead, overlap = leftEnd-rightStart+1- length(D)
|
||||||
|
// with insertion:
|
||||||
|
//
|
||||||
|
// left ******* right ******* = overlap; fragment = left+right - overlap
|
||||||
|
// -------------iii-> note that overlap != leftEnd - rightStart +1
|
||||||
|
// <-iii-------------- instead, overlap = leftEnd - rightStart +1 + length(I)
|
||||||
|
// (since 'i' bases are NOT on the ref)
|
||||||
|
|
||||||
|
int posOnRef = rightStart;
|
||||||
|
// int posOnRightRead = 0;
|
||||||
|
|
||||||
|
int overlap = leftEnd - rightStart + 1 ;
|
||||||
|
|
||||||
|
for(CigarElement ce : left.getCigar().getCigarElements() ) {
|
||||||
|
switch(ce.getOperator()) {
|
||||||
|
case S:
|
||||||
|
case H:
|
||||||
|
// posOnRightRead+=ce.getLength();
|
||||||
|
break;
|
||||||
|
case I:
|
||||||
|
overlap += ce.getLength();
|
||||||
|
break;
|
||||||
|
case D:
|
||||||
|
case N:
|
||||||
|
overlap -= ce.getLength();
|
||||||
|
case M:
|
||||||
|
posOnRef += ce.getLength();
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
if ( posOnRef > leftEnd ) break; // we need to examine only overlapping part of the reads
|
||||||
|
}
|
||||||
|
return left.getReadLength() + right.getReadLength() - overlap;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,195 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.utils.ReadPair;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.utils.AlignmentInfo;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.utils.Assembly;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Sep 13, 2010
|
||||||
|
* Time: 12:45:42 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
|
||||||
|
@WalkerName("DetectWGA")
|
||||||
|
@Requires(value={DataSource.REFERENCE_BASES})
|
||||||
|
|
||||||
|
public class DetectWGAWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
|
private int TIP_LENGTH = 10;
|
||||||
|
private int TIP_MM_THRESHOLD = 1;
|
||||||
|
private double TIP_AV_QUAL_THRESHOLD = 15.0;
|
||||||
|
|
||||||
|
private boolean DEBUG = true;
|
||||||
|
|
||||||
|
Map<String, ReadPair> pairCache = null;
|
||||||
|
Map<String,MathUtils.RunningAverage> fragmentSizeMap = null; // by library
|
||||||
|
private ReferenceDataSource refData;
|
||||||
|
private byte[] refBases;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void initialize() {
|
||||||
|
refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile);
|
||||||
|
pairCache = new HashMap<String, ReadPair>();
|
||||||
|
fragmentSizeMap = new HashMap<String,MathUtils.RunningAverage>();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
|
||||||
|
if ( ! read.getReadPairedFlag() ) return 0; // for now!!
|
||||||
|
|
||||||
|
// read is paired
|
||||||
|
|
||||||
|
cacheReadAsPair(read);
|
||||||
|
|
||||||
|
/*
|
||||||
|
// if the read is already mapped (uniquely), we check if it may have the "colored tip" artifact on either side:
|
||||||
|
if ( AlignmentUtils.isReadUniquelyMapped(read) ) {
|
||||||
|
|
||||||
|
TipInfo tips = countTipMismatches(read,TIP_LENGTH);
|
||||||
|
|
||||||
|
if ( tips.leftMM() >= TIP_MM_THRESHOLD || tips.rightMM() >= TIP_MM_THRESHOLD ) {
|
||||||
|
if ( DEBUG ) {
|
||||||
|
out.println(" Read "+read.getReadName()+ " has "+tips.leftMM()+"/"+tips.rightMM()+" mismatches in the tips");
|
||||||
|
out.println(" Pair orientation: "+pair.getPairType());
|
||||||
|
}
|
||||||
|
// try adding read to existing assemblies:
|
||||||
|
AlignmentInfo al = alignToAllAddToBest(read,Math.min(3,tips.leftMM()+tips.rightMM())-1);
|
||||||
|
if ( al == null ) {
|
||||||
|
if ( tips.leftMM() >= TIP_MM_THRESHOLD && tips.leftQ() >= TIP_AV_QUAL_THRESHOLD ||
|
||||||
|
tips.rightMM() >= TIP_MM_THRESHOLD && tips.rightQ() >= TIP_AV_QUAL_THRESHOLD ) {
|
||||||
|
if ( DEBUG ) out.println(" Initialized new assembly.") ;
|
||||||
|
Assembly a = new Assembly(read.getReadBases(),read.getReadName(),read.getAlignmentStart());
|
||||||
|
tryAndAddUnmapped(a); // see if we got unmapped reads that would align nicely
|
||||||
|
assemblies.add(a);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provide an initial value for reduce computations.
|
||||||
|
*
|
||||||
|
* @return Initial value of reduce.
|
||||||
|
*/
|
||||||
|
public Integer reduceInit() {
|
||||||
|
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||||
|
*
|
||||||
|
* @param value result of the map.
|
||||||
|
* @param sum accumulator for the reduce.
|
||||||
|
* @return accumulator with result of the map taken into account.
|
||||||
|
*/
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/** little helper: if we already cached the pair object for this read, just add the read to that object; if we did not - instantiate
|
||||||
|
* new pair objetc first and register it in the map, then add the read; this method also updates other cache(s)/trackers as needed, e.g.
|
||||||
|
* fragment size map
|
||||||
|
*/
|
||||||
|
private void cacheReadAsPair(SAMRecord read) {
|
||||||
|
ReadPair pair = pairCache.get( read.getReadName() );
|
||||||
|
if ( pair == null ) {
|
||||||
|
pair = new ReadPair(read);
|
||||||
|
pairCache.put(read.getReadName(),pair);
|
||||||
|
}
|
||||||
|
|
||||||
|
pair.addRead(read);
|
||||||
|
|
||||||
|
// if it's a good pair, add its fragment size to the stats:
|
||||||
|
if ( pair.hasBothEnds() && pair.bothEndsMapped() && pair.isProper() ) {
|
||||||
|
String lib = read.getReadGroup().getLibrary();
|
||||||
|
MathUtils.RunningAverage fSize = fragmentSizeMap.get(lib);
|
||||||
|
if ( fSize == null ) {
|
||||||
|
fSize = new MathUtils.RunningAverage();
|
||||||
|
fragmentSizeMap.put(lib,fSize);
|
||||||
|
}
|
||||||
|
fSize.add(pair.getFragmentSize());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private TipInfo countTipMismatches(SAMRecord read, int tip_length) {
|
||||||
|
|
||||||
|
AlignmentUtils.MismatchCount left_mm = AlignmentUtils.getMismatchCount(read,refBases,read.getAlignmentStart()-1,0,tip_length);
|
||||||
|
|
||||||
|
int right_start = read.getReadLength()-tip_length;
|
||||||
|
AlignmentUtils.MismatchCount right_mm = AlignmentUtils.getMismatchCount(read,refBases,read.getAlignmentStart()-1,right_start,read.getReadLength()-right_start);
|
||||||
|
|
||||||
|
return new TipInfo(left_mm,right_mm);
|
||||||
|
}
|
||||||
|
|
||||||
|
class TipInfo {
|
||||||
|
AlignmentUtils.MismatchCount left_mm;
|
||||||
|
AlignmentUtils.MismatchCount right_mm;
|
||||||
|
double left_avQ;
|
||||||
|
double right_avQ;
|
||||||
|
|
||||||
|
public TipInfo(AlignmentUtils.MismatchCount l,AlignmentUtils.MismatchCount r) {
|
||||||
|
left_mm = l;
|
||||||
|
right_mm = r;
|
||||||
|
left_avQ = (l.numMismatches ==0 ? 0 : ((double)l.mismatchQualities)/l.numMismatches );
|
||||||
|
right_avQ = (r.numMismatches ==0 ? 0 : ((double)r.mismatchQualities)/r.numMismatches );
|
||||||
|
}
|
||||||
|
|
||||||
|
public int leftMM() { return left_mm.numMismatches; }
|
||||||
|
public int rightMM() { return right_mm.numMismatches; }
|
||||||
|
public double leftQ() { return left_avQ; }
|
||||||
|
public double rightQ() { return right_avQ; }
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -363,7 +363,13 @@ public class PairMaker extends CommandLineProgram {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ( best.size() == 0 ) return null; // no unique alignment
|
if ( best.size() == 0 ) return null; // no unique alignment
|
||||||
if ( best.size() > 1 ) throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq);
|
if ( best.size() > 1 ) {
|
||||||
|
for ( SAMRecord r : best ) {
|
||||||
|
System.out.println("READ "+r.getReadName()+" mapQ="+r.getMappingQuality()+" at="+r.getReferenceName()+
|
||||||
|
":"+r.getAlignmentStart()+"("+(r.getReadNegativeStrandFlag()?"-":"+")+") cig="+r.getCigarString());
|
||||||
|
}
|
||||||
|
throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq);
|
||||||
|
}
|
||||||
|
|
||||||
return best.get(0);
|
return best.get(0);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,171 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.interval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Oct 7, 2010
|
||||||
|
* Time: 2:40:02 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** This class provides an adapter to Iterator<GenomeLoc> that returns only (parts of) underlying iterator's
|
||||||
|
* intervals overlapping with specified "master set" of bounding intervals. The underlying iterator must return
|
||||||
|
* NON-overlapping intervals in coordinate-sorted order, otherwise the behavior is unspecified. If the master set is represented by
|
||||||
|
* another interval iterator, it should return sorted and NON-overlapping intervals.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class OverlappingIntervalIterator implements Iterator<GenomeLoc> {
|
||||||
|
PushbackIterator<GenomeLoc> iter = null;
|
||||||
|
PushbackIterator<GenomeLoc> boundBy = null;
|
||||||
|
|
||||||
|
GenomeLoc prefetchedOverlap = null;
|
||||||
|
GenomeLoc currentBound = null;
|
||||||
|
GenomeLoc currentInterval = null;
|
||||||
|
|
||||||
|
/** Creates new overlapping iterator that will internally traverse <code>intervals</code> and return only
|
||||||
|
* overlaps of those with set of intervals returned by <code>boundBy</code>.
|
||||||
|
* @param intervals
|
||||||
|
* @param boundBy
|
||||||
|
*/
|
||||||
|
public OverlappingIntervalIterator(Iterator<GenomeLoc> intervals, Iterator<GenomeLoc> boundBy) {
|
||||||
|
this.iter = new PushbackIterator(intervals);
|
||||||
|
this.boundBy = new PushbackIterator(boundBy);
|
||||||
|
|
||||||
|
if ( iter.hasNext() && boundBy.hasNext() ) {
|
||||||
|
GenomeLoc currentInterval = iter.next(); // load first interval
|
||||||
|
GenomeLoc currentBound = boundBy.next(); // load first bounding interval
|
||||||
|
fetchNextOverlap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Traverses both iterators in sync, until the first overlap between the two is reached. If no overlap is found
|
||||||
|
* until the end of the either of the two streams, leaves prefetchedOverlap set to null
|
||||||
|
*/
|
||||||
|
private void fetchNextOverlap() {
|
||||||
|
|
||||||
|
prefetchedOverlap = null;
|
||||||
|
|
||||||
|
while ( prefetchedOverlap == null ) {
|
||||||
|
|
||||||
|
if ( currentInterval.isBefore(currentBound) ) {
|
||||||
|
if ( ! iter.hasNext() ) break; // no more intervals left; we are done
|
||||||
|
currentInterval = iter.next();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( currentInterval.isPast(currentBound) ) {
|
||||||
|
if ( ! boundBy.hasNext() ) break; // we are past the last available bounding interval, we are done!
|
||||||
|
currentBound = boundBy.next();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we are at this point only if currentInterval overlaps with currentBound
|
||||||
|
|
||||||
|
prefetchedOverlap = currentInterval.intersect(currentBound);
|
||||||
|
|
||||||
|
// we still do not know if we are done with either current interval or current bound, because
|
||||||
|
// two special situations are possible:
|
||||||
|
//
|
||||||
|
// 1) next interval overlaps with 2) current interval also overlaps with
|
||||||
|
// the same bounding interval; next bounding interval; note that
|
||||||
|
// note that in this case next in this case next bound necessarily
|
||||||
|
// interval necessarily starts before starts before the next interval
|
||||||
|
// the next bound
|
||||||
|
//
|
||||||
|
// curr. int next int. curr. int
|
||||||
|
// ----- ------ --------------------------
|
||||||
|
// ------------------- --------- -------------
|
||||||
|
// curr. bound curr. bound next bound
|
||||||
|
|
||||||
|
// To solve this issue we update either only currentInterval or only currentBound to their next value,
|
||||||
|
// whichever comes first; the rest of the traversal to the next overlap will be performed on the next invocation of
|
||||||
|
// fetchNextOverlap().
|
||||||
|
|
||||||
|
if ( ! iter.hasNext() ) {
|
||||||
|
|
||||||
|
}
|
||||||
|
GenomeLoc nextInterval = iter.next();
|
||||||
|
GenomeLoc nextBound = boundBy.next();
|
||||||
|
|
||||||
|
if ( nextInterval.startsBefore(nextBound)) {
|
||||||
|
currentInterval = nextInterval;
|
||||||
|
boundBy.pushback(nextBound); // in case next interval overlaps with the current bound
|
||||||
|
} else {
|
||||||
|
currentBound = nextBound;
|
||||||
|
iter.pushback(nextInterval); // in case current interval also overlaps with the next bound
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns <tt>true</tt> if the iteration has more elements. (In other
|
||||||
|
* words, returns <tt>true</tt> if <tt>next</tt> would return an element
|
||||||
|
* rather than throwing an exception.)
|
||||||
|
*
|
||||||
|
* @return <tt>true</tt> if the iterator has more elements.
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
return false; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the next element in the iteration.
|
||||||
|
*
|
||||||
|
* @return the next element in the iteration.
|
||||||
|
* @throws java.util.NoSuchElementException
|
||||||
|
* iteration has no more elements.
|
||||||
|
*/
|
||||||
|
public GenomeLoc next() {
|
||||||
|
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Removes from the underlying collection the last element returned by the
|
||||||
|
* iterator (optional operation). This method can be called only once per
|
||||||
|
* call to <tt>next</tt>. The behavior of an iterator is unspecified if
|
||||||
|
* the underlying collection is modified while the iteration is in
|
||||||
|
* progress in any way other than by calling this method.
|
||||||
|
*
|
||||||
|
* @throws UnsupportedOperationException if the <tt>remove</tt>
|
||||||
|
* operation is not supported by this Iterator.
|
||||||
|
* @throws IllegalStateException if the <tt>next</tt> method has not
|
||||||
|
* yet been called, or the <tt>remove</tt> method has already
|
||||||
|
* been called after the last call to the <tt>next</tt>
|
||||||
|
* method.
|
||||||
|
*/
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("remove() method is not supported by OverlappingIntervalIterator");
|
||||||
|
//To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -41,7 +41,7 @@ import java.util.Arrays;
|
||||||
|
|
||||||
public class AlignmentUtils {
|
public class AlignmentUtils {
|
||||||
|
|
||||||
private static class MismatchCount {
|
public static class MismatchCount {
|
||||||
public int numMismatches = 0;
|
public int numMismatches = 0;
|
||||||
public long mismatchQualities = 0;
|
public long mismatchQualities = 0;
|
||||||
}
|
}
|
||||||
|
|
@ -98,13 +98,13 @@ public class AlignmentUtils {
|
||||||
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
|
public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||||
return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
|
return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
|
||||||
}
|
}
|
||||||
|
|
||||||
// todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
|
// todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
|
||||||
// todo -- high performance implementation. We can do a lot better than this right now
|
// todo -- high performance implementation. We can do a lot better than this right now
|
||||||
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
|
public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
|
||||||
MismatchCount mc = new MismatchCount();
|
MismatchCount mc = new MismatchCount();
|
||||||
|
|
||||||
int readIdx = 0;
|
int readIdx = 0;
|
||||||
|
|
@ -387,6 +387,15 @@ public class AlignmentUtils {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns true is read is mapped and mapped uniquely (Q>0).
|
||||||
|
*
|
||||||
|
* @param read
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static boolean isReadUniquelyMapped(SAMRecord read) {
|
||||||
|
return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
|
/** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
|
||||||
* cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
|
* cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
|
||||||
* qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
|
* qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue