When you get the reference string for a read that is mapped partially off the end of a contig, the string is masked with X's for base positions without corresponding reference positions.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1121 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1dcababad1
commit
03f8177a53
|
|
@ -4,41 +4,63 @@ import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.SAMSequenceRecord;
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
import net.sf.samtools.util.StringUtil;
|
import net.sf.samtools.util.StringUtil;
|
||||||
import net.sf.picard.reference.ReferenceSequence;
|
import net.sf.picard.reference.ReferenceSequence;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* User: hanna
|
* User: hanna
|
||||||
* Date: May 22, 2009
|
* Date: May 22, 2009
|
||||||
* Time: 12:36:14 PM
|
* Time: 12:36:14 PM
|
||||||
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
|
|
||||||
* Software and documentation are copyright 2005 by the Broad Institute.
|
|
||||||
* All rights are reserved.
|
|
||||||
*
|
*
|
||||||
* Users acknowledge that this software is supplied without any warranty or support.
|
|
||||||
* The Broad Institute is not responsible for its use, misuse, or
|
|
||||||
* functionality.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/** Provides access to the reference over a single read. */
|
||||||
* Provides access to the reference over a single read.
|
|
||||||
*/
|
|
||||||
|
|
||||||
public class ReadReferenceView extends ReferenceView {
|
public class ReadReferenceView extends ReferenceView {
|
||||||
/**
|
/**
|
||||||
* Create a view of the reference with respect to a single read.
|
* Create a view of the reference with respect to a single read.
|
||||||
* @param provider
|
*
|
||||||
|
* @param provider
|
||||||
*/
|
*/
|
||||||
public ReadReferenceView( ShardDataProvider provider ) {
|
public ReadReferenceView( ShardDataProvider provider ) {
|
||||||
super( provider );
|
super(provider);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the bases of the reference that are aligned to the given read.
|
* Gets the bases of the reference that are aligned to the given read.
|
||||||
|
*
|
||||||
* @param read the read for which to extract reference information.
|
* @param read the read for which to extract reference information.
|
||||||
|
*
|
||||||
* @return The bases corresponding to this read, or null if the read is unmapped.
|
* @return The bases corresponding to this read, or null if the read is unmapped.
|
||||||
* If the alignment goes off the end of the contig, return just the portion
|
* If the alignment goes off the end of the contig, return just the portion
|
||||||
* mapped to the reference.
|
* mapped to the reference, followed by X's corresponding to the rest of the read.
|
||||||
|
* This indicates that the rest lies off the end of the contig.
|
||||||
*/
|
*/
|
||||||
public char[] getReferenceBases( SAMRecord read ) {
|
public char[] getReferenceBases( SAMRecord read ) {
|
||||||
if( read.getReadUnmappedFlag() )
|
if (read.getReadUnmappedFlag())
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
String contig = read.getReferenceName();
|
String contig = read.getReferenceName();
|
||||||
|
|
@ -46,11 +68,11 @@ public class ReadReferenceView extends ReferenceView {
|
||||||
int stop = read.getAlignmentEnd();
|
int stop = read.getAlignmentEnd();
|
||||||
|
|
||||||
SAMSequenceRecord sequenceRecord = reference.getSequenceDictionary().getSequence(contig);
|
SAMSequenceRecord sequenceRecord = reference.getSequenceDictionary().getSequence(contig);
|
||||||
if( stop > sequenceRecord.getSequenceLength() )
|
if (stop > sequenceRecord.getSequenceLength())
|
||||||
stop = sequenceRecord.getSequenceLength();
|
stop = sequenceRecord.getSequenceLength();
|
||||||
|
|
||||||
ReferenceSequence alignmentToReference = reference.getSubsequenceAt( contig, start, stop );
|
ReferenceSequence alignmentToReference = reference.getSubsequenceAt(contig, start, stop);
|
||||||
return StringUtil.bytesToString(alignmentToReference.getBases()).toCharArray();
|
return ( StringUtil.bytesToString(alignmentToReference.getBases()) + Utils.dupString('X', read.getAlignmentEnd() - stop) ).toCharArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,25 +2,42 @@ package org.broadinstitute.sting.gatk.datasources.providers;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.*;
|
||||||
import net.sf.samtools.SAMFileHeader;
|
|
||||||
import net.sf.samtools.Cigar;
|
|
||||||
import net.sf.samtools.CigarElement;
|
|
||||||
import net.sf.samtools.CigarOperator;
|
|
||||||
import net.sf.samtools.util.StringUtil;
|
import net.sf.samtools.util.StringUtil;
|
||||||
import net.sf.picard.reference.ReferenceSequence;
|
import net.sf.picard.reference.ReferenceSequence;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* User: hanna
|
* User: hanna
|
||||||
* Date: May 27, 2009
|
* Date: May 27, 2009
|
||||||
* Time: 1:04:27 PM
|
* Time: 1:04:27 PM
|
||||||
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
|
|
||||||
* Software and documentation are copyright 2005 by the Broad Institute.
|
|
||||||
* All rights are reserved.
|
|
||||||
*
|
*
|
||||||
* Users acknowledge that this software is supplied without any warranty or support.
|
|
||||||
* The Broad Institute is not responsible for its use, misuse, or
|
|
||||||
* functionality.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -28,9 +45,54 @@ import net.sf.picard.reference.ReferenceSequence;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class ReadReferenceViewTest extends ReferenceViewTemplate {
|
public class ReadReferenceViewTest extends ReferenceViewTemplate {
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tests that the ReadReferenceView correctly generates X's when a read overhangs the
|
||||||
|
* end of a contig
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testOverhangingRead() {
|
||||||
|
testOverhangingGivenSize(25,0);
|
||||||
|
testOverhangingGivenSize(25,12);
|
||||||
|
testOverhangingGivenSize(25,24);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* a private helper that tests getting the reference bases for reads that overhang the end of the
|
||||||
|
* contig
|
||||||
|
* @param readLength the length of the read
|
||||||
|
* @param overlap the number of bases by which the read extends past the end of the contig
|
||||||
|
*/
|
||||||
|
private void testOverhangingGivenSize(int readLength, int overlap) {
|
||||||
|
SAMSequenceRecord selectedContig = sequenceFile.getSequenceDictionary().getSequences().get(sequenceFile.getSequenceDictionary().getSequences().size()-1);
|
||||||
|
final long contigStart = selectedContig.getSequenceLength() - (readLength - overlap - 1);
|
||||||
|
final long contigStop = selectedContig.getSequenceLength() + overlap;
|
||||||
|
|
||||||
|
ShardDataProvider dataProvider = new ShardDataProvider(null,null,sequenceFile,null);
|
||||||
|
ReadReferenceView view = new ReadReferenceView(dataProvider);
|
||||||
|
|
||||||
|
SAMRecord rec = buildSAMRecord(selectedContig.getSequenceName(),(int)contigStart,(int)contigStop);
|
||||||
|
ReferenceSequence expectedAsSeq = sequenceFile.getSubsequenceAt(selectedContig.getSequenceName(),(int)contigStart,selectedContig.getSequenceLength());
|
||||||
|
char[] expected = StringUtil.bytesToString(expectedAsSeq.getBases()).toCharArray();
|
||||||
|
char[] actual = view.getReferenceBases(rec);
|
||||||
|
|
||||||
|
Assert.assertEquals(expected.length, (readLength - overlap));
|
||||||
|
Assert.assertEquals(actual.length, readLength);
|
||||||
|
int xRange = 0;
|
||||||
|
for (; xRange < (readLength - overlap); xRange++) {
|
||||||
|
Assert.assertTrue(actual[xRange] != 'X');
|
||||||
|
}
|
||||||
|
for (; xRange < actual.length; xRange++) {
|
||||||
|
Assert.assertTrue(actual[xRange] == 'X');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compares the contents of the fasta and view at a specified location.
|
* Compares the contents of the fasta and view at a specified location.
|
||||||
* @param loc
|
* @param loc the location to validate
|
||||||
*/
|
*/
|
||||||
protected void validateLocation( GenomeLoc loc ) {
|
protected void validateLocation( GenomeLoc loc ) {
|
||||||
SAMRecord read = buildSAMRecord( loc.getContig(), (int)loc.getStart(), (int)loc.getStop() );
|
SAMRecord read = buildSAMRecord( loc.getContig(), (int)loc.getStart(), (int)loc.getStop() );
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue