made a fake fasta generator, so we can now generate a complete bam / fasta combo of made up data.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1150 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c2e5a68aaf
commit
d4d3af20f2
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.broadinstitute.sting.utils.fasta;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class ArtificialFastaUtils
|
||||||
|
* <p/>
|
||||||
|
* artificial fasta utility class, for generating fake fastas.
|
||||||
|
*/
|
||||||
|
public class ArtificialFastaUtils {
|
||||||
|
public enum BASE_PATTERN {
|
||||||
|
RANDOM, ALL_A, ALL_T, ALL_C, ALL_G;
|
||||||
|
}
|
||||||
|
|
||||||
|
// what bases we support
|
||||||
|
public enum BASES {
|
||||||
|
A, T, C, G;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create an artificial fasta file
|
||||||
|
public static void createArtificialFasta(String fileName,
|
||||||
|
List<String> contigNames,
|
||||||
|
List<Integer> contigSizes,
|
||||||
|
BASE_PATTERN pattern) {
|
||||||
|
PrintStream s;
|
||||||
|
try {
|
||||||
|
s = new PrintStream(new FileOutputStream(fileName));
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
throw new StingException("Filename " + fileName + " passed to the ArtificialFastaUtils generated a FileNotFound exception", e);
|
||||||
|
}
|
||||||
|
generateFakeFasta(contigNames, contigSizes, pattern, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// create an artificial fasta file
|
||||||
|
public static void createArtificialFasta(PrintStream stream,
|
||||||
|
List<String> contigNames,
|
||||||
|
List<Integer> contigSizes,
|
||||||
|
BASE_PATTERN pattern) {
|
||||||
|
|
||||||
|
generateFakeFasta(contigNames, contigSizes, pattern, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a fake fasta file
|
||||||
|
*
|
||||||
|
* @param contigNames the pile of contig names
|
||||||
|
* @param contigSizes the pile of contig sizes
|
||||||
|
* @param pattern the pattern to use for the base distrobution
|
||||||
|
* @param s the print stream to write to
|
||||||
|
*/
|
||||||
|
private static void generateFakeFasta(List<String> contigNames, List<Integer> contigSizes, BASE_PATTERN pattern, PrintStream s) {
|
||||||
|
if (contigNames.size() != contigSizes.size()) {
|
||||||
|
throw new StingException("ArtificialContig name and size arrays are not equal sizes");
|
||||||
|
}
|
||||||
|
for (int x = 0; x < contigNames.size(); x++) {
|
||||||
|
ArtificialContig tig = new ArtificialContig(contigNames.get(x), contigSizes.get(x), pattern);
|
||||||
|
tig.write(s);
|
||||||
|
}
|
||||||
|
s.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** the fake contig class, a fasta is made up of these */
|
||||||
|
class ArtificialContig {
|
||||||
|
public static final int COLUMN_WIDTH = 80;
|
||||||
|
|
||||||
|
final protected String mName;
|
||||||
|
final protected int mSize;
|
||||||
|
final protected ArtificialFastaUtils.BASE_PATTERN mPattern;
|
||||||
|
|
||||||
|
public ArtificialContig(String name, int size, ArtificialFastaUtils.BASE_PATTERN pat) {
|
||||||
|
this.mName = name;
|
||||||
|
this.mSize = size;
|
||||||
|
this.mPattern = pat;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* write out the contig to a stream
|
||||||
|
*
|
||||||
|
* @param stream
|
||||||
|
*/
|
||||||
|
public void write(PrintStream stream) {
|
||||||
|
stream.println(">" + mName);
|
||||||
|
int count = 0;
|
||||||
|
while (count < mSize) {
|
||||||
|
for (int x = 0; x < COLUMN_WIDTH; x++) {
|
||||||
|
stream.print(generateAppropriateBase());
|
||||||
|
count++;
|
||||||
|
if (count >= mSize) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stream.println();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* generate the appropriate base, given the BASE_PATTERN
|
||||||
|
*
|
||||||
|
* @return a base, as a string
|
||||||
|
*/
|
||||||
|
public String generateAppropriateBase() {
|
||||||
|
switch (mPattern) {
|
||||||
|
case RANDOM:
|
||||||
|
return (ArtificialFastaUtils.BASES.values()[(int) Math.round(Math.random() * 4)]).toString();
|
||||||
|
case ALL_A:
|
||||||
|
return "A";
|
||||||
|
case ALL_T:
|
||||||
|
return "T";
|
||||||
|
case ALL_C:
|
||||||
|
return "C";
|
||||||
|
case ALL_G:
|
||||||
|
return "G";
|
||||||
|
default:
|
||||||
|
throw new StingException("Unknown base pattern");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author aaron
|
||||||
|
*
|
||||||
|
* Class ArtificialSAMGenerator
|
||||||
|
*
|
||||||
|
* This provides an external utility that creates sam files and their associated fasta files
|
||||||
|
*/
|
||||||
|
public class ArtificialSAMGenerator {
// NOTE(review): no members are defined here yet; this appears to be a placeholder for the generator utility -- confirm
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ArtificialFASTAUtils {
// NOTE(review): empty package-private class whose name differs only in letter case from
// org.broadinstitute.sting.utils.fasta.ArtificialFastaUtils; looks like a leftover stub -- confirm whether it is needed
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -161,7 +161,7 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator implements
|
||||||
contigIndex = rec.getSequenceIndex();
|
contigIndex = rec.getSequenceIndex();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (contigIndex < 0) { throw new IllegalArgumentException("Contig" + contig + " doesn't exist"); }
|
if (contigIndex < 0) { throw new IllegalArgumentException("ArtificialContig" + contig + " doesn't exist"); }
|
||||||
while (super.hasNext() && this.peek().getReferenceIndex() < contigIndex) {
|
while (super.hasNext() && this.peek().getReferenceIndex() < contigIndex) {
|
||||||
super.next();
|
super.next();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,30 +1,12 @@
|
||||||
package org.broadinstitute.sting.utils.sam;
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
|
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
/**
|
import java.io.File;
|
||||||
*
|
import java.util.ArrayList;
|
||||||
* User: aaron
|
import java.util.List;
|
||||||
* Date: May 21, 2009
|
|
||||||
* Time: 2:57:48 PM
|
|
||||||
*
|
|
||||||
* The Broad Institute
|
|
||||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
||||||
* This software and its documentation are copyright 2009 by the
|
|
||||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
||||||
*
|
|
||||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
||||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author aaron
|
* @author aaron
|
||||||
|
|
@ -46,11 +28,11 @@ public class ArtificialSAMUtils {
|
||||||
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
||||||
File outFile = new File(filename);
|
File outFile = new File(filename);
|
||||||
|
|
||||||
SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, false, outFile);
|
SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile);
|
||||||
|
|
||||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||||
for (int readNumber = 0; readNumber < readsPerChomosome; readNumber++) {
|
for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) {
|
||||||
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100));
|
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -73,7 +55,7 @@ public class ArtificialSAMUtils {
|
||||||
SAMFileWriter out = new SAMFileWriterFactory().makeSAMWriter(header, false, outFile);
|
SAMFileWriter out = new SAMFileWriterFactory().makeSAMWriter(header, false, outFile);
|
||||||
|
|
||||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||||
for (int readNumber = 0; readNumber < readsPerChomosome; readNumber++) {
|
for (int readNumber = 1; readNumber <= readsPerChomosome; readNumber++) {
|
||||||
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100));
|
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -92,6 +74,7 @@ public class ArtificialSAMUtils {
|
||||||
*/
|
*/
|
||||||
public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) {
|
public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) {
|
||||||
SAMFileHeader header = new SAMFileHeader();
|
SAMFileHeader header = new SAMFileHeader();
|
||||||
|
header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate);
|
||||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||||
// make up some sequence records
|
// make up some sequence records
|
||||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||||
|
|
@ -161,7 +144,7 @@ public class ArtificialSAMUtils {
|
||||||
*/
|
*/
|
||||||
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||||
if (alignmentStart == 0)
|
if (alignmentStart == 0)
|
||||||
throw new StingException("Invalid alignment start for artificial read");
|
throw new StingException("Invalid alignment start for artificial read, start = " + alignmentStart);
|
||||||
SAMRecord record = new SAMRecord(header);
|
SAMRecord record = new SAMRecord(header);
|
||||||
record.setReadName(name);
|
record.setReadName(name);
|
||||||
record.setReferenceIndex(refIndex);
|
record.setReferenceIndex(refIndex);
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
package org.broadinstitute.sting.utils.fasta;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class ArtificialFastaUtilsTest
|
||||||
|
* <p/>
|
||||||
|
* test out the ArtificialFastaUtils functionality
|
||||||
|
*/
|
||||||
|
public class ArtificialFastaUtilsTest extends BaseTest {
|
||||||
|
|
||||||
|
/** generate a fake fasta */
|
||||||
|
@Test
|
||||||
|
public void testFastaGeneration() {
|
||||||
|
List<String> names = new ArrayList<String>();
|
||||||
|
List<Integer> sizes = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
for (int x = 0; x < 5; x++) {
|
||||||
|
sizes.add(1000);
|
||||||
|
names.add("chr" + (x+1));
|
||||||
|
}
|
||||||
|
File temp = new File("tempFileFasta.fasta");
|
||||||
|
ArtificialFastaUtils.createArtificialFasta(temp.getName(),names,sizes,ArtificialFastaUtils.BASE_PATTERN.ALL_A);
|
||||||
|
|
||||||
|
// using the fasta sequence file to test, in reality we should use the indexed version
|
||||||
|
FastaSequenceFile2 fasta = new FastaSequenceFile2(temp);
|
||||||
|
|
||||||
|
Assert.assertEquals(5,fasta.getSequenceDictionary().getSequences().size());
|
||||||
|
|
||||||
|
ArtificialSAMUtils.createArtificialBamFile("tempFileBAM.bam",5,1,1000,600);
|
||||||
|
//temp.delete();
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue