PlinkRodWithGenomeLoc now properly parses text plink files. Unit test added to test this functionality. Indels and binary files to come.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2666 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
648a36d08e
commit
42fb85e7f3
|
|
@ -4,10 +4,15 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
|
|
@ -16,7 +21,7 @@ import java.util.*;
|
|||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum {
|
||||
|
||||
private boolean allowTest = false;
|
||||
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
|
||||
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE"));
|
||||
private final byte SNP_MAJOR_MODE = 0x00000001;
|
||||
|
|
@ -25,7 +30,6 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
|
|||
private PlinkVariantInfo currentVariant;
|
||||
private ListIterator<PlinkVariantInfo> variantIterator;
|
||||
|
||||
|
||||
private PlinkFileType plinkFileType;
|
||||
|
||||
public enum PlinkFileType {
|
||||
|
|
@ -93,10 +97,11 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
|
|||
return currentVariant.getGenotypes();
|
||||
}
|
||||
|
||||
|
||||
// AM I PARSING A TEXT OR A BINARY FILE ??
|
||||
|
||||
private ArrayList<PlinkVariantInfo> parsePlinkFile(File file) {
|
||||
String[] splitFileName = file.getName().split(".");
|
||||
String[] splitFileName = file.getName().split("\\.");
|
||||
String extension = splitFileName[splitFileName.length-1];
|
||||
if ( extension.equals("ped") || extension.equals("raw") ) {
|
||||
return parseTextFormattedPlinkFile(file);
|
||||
|
|
@ -127,6 +132,7 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
|
|||
incorporateInfo(seqVars,snpOffsets,line);
|
||||
} while ( line != null );
|
||||
|
||||
|
||||
java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these
|
||||
// are sorted in standard reference order
|
||||
|
||||
|
|
@ -140,6 +146,10 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
|
|||
}
|
||||
|
||||
private void incorporateInfo(List<PlinkVariantInfo> vars, List<Integer> offsets, String plinkLine) {
|
||||
if ( plinkLine == null ) {
|
||||
return;
|
||||
}
|
||||
|
||||
String[] plinkInfo;
|
||||
if ( plinkFileType == PlinkFileType.STANDARD_PED) {
|
||||
plinkInfo = plinkLine.split("\t");
|
||||
|
|
@ -190,7 +200,7 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
|
|||
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||
String[] headerFields;
|
||||
if ( plinkFileType == PlinkFileType.STANDARD_PED ) {
|
||||
headerFields = header.split("\t");
|
||||
headerFields = header.split("\t+");
|
||||
} else {
|
||||
headerFields = header.split("\\s+");
|
||||
}
|
||||
|
|
@ -439,6 +449,7 @@ class PlinkVariantInfo implements Comparable {
|
|||
|
||||
public PlinkVariantInfo(String variantName) {
|
||||
this.variantName = variantName;
|
||||
parseName();
|
||||
}
|
||||
|
||||
public PlinkVariantInfo(String variantName, boolean onlyLookForIndelInfo ) {
|
||||
|
|
@ -451,7 +462,7 @@ class PlinkVariantInfo implements Comparable {
|
|||
}
|
||||
|
||||
private void parseName() {
|
||||
String chrom = this.variantName.split("_c")[1].split("_")[0];
|
||||
String chrom = this.variantName.split("\\|c")[1].split("_")[0];
|
||||
String pos = this.variantName.split("_p")[1].split("_")[0];
|
||||
this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
|
||||
this.parseNameForIndels();
|
||||
|
|
@ -505,6 +516,8 @@ class PlinkVariantInfo implements Comparable {
|
|||
for ( String alStr : alleleStrings ) {
|
||||
alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr));
|
||||
}
|
||||
genotypes.add(new Genotype(alleles,sampleName,20.0) );
|
||||
sampleNames.add(sampleName);
|
||||
}
|
||||
|
||||
private void addIndel(String[] alleleStrings, String sampleName) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,117 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: Ghost
|
||||
* Date: Jan 22, 2010
|
||||
* Time: 11:27:33 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PlinkRodTest extends BaseTest {
|
||||
private static IndexedFastaSequenceFile seq;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeTests() {
|
||||
try {
|
||||
seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta"));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("unable to load the sequence dictionary");
|
||||
}
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
|
||||
}
|
||||
|
||||
public BufferedReader openFile(String filename) {
|
||||
try {
|
||||
return new BufferedReader(new FileReader(filename));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Couldn't open file " + filename);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStandardPedFile() {
|
||||
PlinkRodWithGenomeLoc rod = new PlinkRodWithGenomeLoc("test");
|
||||
try {
|
||||
rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("test file for testStandardPedFile() does not exist",e);
|
||||
}
|
||||
|
||||
// test that the sample names are correct
|
||||
|
||||
List<String> rodNames = rod.getVariantSampleNames();
|
||||
List<String> expectedNames = Arrays.asList("NA12887","NAMELY","COWBA");
|
||||
|
||||
Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size());
|
||||
|
||||
boolean namesCorrect = true;
|
||||
for ( int i = 0; i < expectedNames.size(); i++ ) {
|
||||
namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) );
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the names are correct and in the proper order",namesCorrect);
|
||||
|
||||
// test that rod can be iterated over
|
||||
|
||||
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
|
||||
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
do {
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
sampleNamesInRod.add(rod.getVariantSampleNames());
|
||||
lociInRod.add(rod.getLocation());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size());
|
||||
|
||||
ArrayList<Genotype> snp1 = genotypesInRod.get(0);
|
||||
ArrayList<Genotype> snp3 = genotypesInRod.get(2);
|
||||
|
||||
Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size());
|
||||
Assert.assertEquals("That there are three samples in SNP 1", 3, sampleNamesInRod.get(0).size());
|
||||
Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size());
|
||||
|
||||
|
||||
List<Allele> snp1_individual3_alleles = snp1.get(2).getAlleles();
|
||||
List<Allele> snp3_individual2_alleles = snp3.get(1).getAlleles();
|
||||
|
||||
String alleleStr1 = snp1_individual3_alleles.get(0).getBases()+snp1_individual3_alleles.get(1).getBases();
|
||||
String alleleStr2 = snp3_individual2_alleles.get(0).getBases()+snp3_individual2_alleles.get(1).getBases();
|
||||
|
||||
Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1);
|
||||
Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2);
|
||||
|
||||
boolean name2isSame = true;
|
||||
|
||||
for ( ArrayList<String> names : sampleNamesInRod ) {
|
||||
name2isSame = name2isSame && names.get(1).equals("NAMELY");
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame);
|
||||
|
||||
// test that the loci are correctly parsed and in order
|
||||
|
||||
List<String> expectedLoci = Arrays.asList("1:123456","2:13274","3:11111");
|
||||
boolean lociCorrect = true;
|
||||
for ( int i = 0; i < 3; i ++ ) {
|
||||
lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue