diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRodWithGenomeLoc.java b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRodWithGenomeLoc.java index 7bc30c9b7..7bd64aaa3 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRodWithGenomeLoc.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRodWithGenomeLoc.java @@ -4,10 +4,15 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.CommandLineGATK; import java.io.*; import java.util.*; +import net.sf.samtools.SAMFileHeader; + /** * Created by IntelliJ IDEA. * User: chartl @@ -16,7 +21,7 @@ import java.util.*; * To change this template use File | Settings | File Templates. */ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum { - + private boolean allowTest = false; private final Set headerEntries = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex", "Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE")); private final byte SNP_MAJOR_MODE = 0x00000001; @@ -25,7 +30,6 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements private PlinkVariantInfo currentVariant; private ListIterator variantIterator; - private PlinkFileType plinkFileType; public enum PlinkFileType { @@ -93,10 +97,11 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements return currentVariant.getGenotypes(); } + // AM I PARSING A TEXT OR A BINARY FILE ?? private ArrayList parsePlinkFile(File file) { - String[] splitFileName = file.getName().split("."); + String[] splitFileName = file.getName().split("\\."); String extension = splitFileName[splitFileName.length-1]; if ( extension.equals("ped") || extension.equals("raw") ) { return parseTextFormattedPlinkFile(file); @@ -127,6 +132,7 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements incorporateInfo(seqVars,snpOffsets,line); } while ( line != null ); + java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these // are sorted in standard reference order @@ -140,6 +146,10 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements } private void incorporateInfo(List vars, List offsets, String plinkLine) { + if ( plinkLine == null ) { + return; + } + String[] plinkInfo; if ( plinkFileType == PlinkFileType.STANDARD_PED) { plinkInfo = plinkLine.split("\t"); @@ -190,7 +200,7 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ArrayList offsets = new ArrayList(); String[] headerFields; if ( plinkFileType == PlinkFileType.STANDARD_PED ) { - headerFields = header.split("\t"); + headerFields = header.split("\t+"); } else { headerFields = header.split("\\s+"); } @@ -439,6 +449,7 @@ class PlinkVariantInfo implements Comparable { public PlinkVariantInfo(String variantName) { this.variantName = variantName; + parseName(); } public PlinkVariantInfo(String variantName, boolean onlyLookForIndelInfo ) { @@ -451,7 +462,7 @@ class PlinkVariantInfo implements Comparable { } private void parseName() { - String chrom = this.variantName.split("_c")[1].split("_")[0]; + String chrom = this.variantName.split("\\|c")[1].split("_")[0]; String pos = this.variantName.split("_p")[1].split("_")[0]; this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); this.parseNameForIndels(); @@ -505,6 +516,8 @@ class PlinkVariantInfo implements Comparable { for ( String alStr : alleleStrings ) { alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr)); } + genotypes.add(new Genotype(alleles,sampleName,20.0) ); + sampleNames.add(sampleName); } private void addIndel(String[] alleleStrings, String sampleName) { diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodTest.java new file mode 100755 index 000000000..3d98a8013 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodTest.java @@ -0,0 +1,117 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.Assert; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.BufferedReader; +import java.io.FileReader; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: Ghost + * Date: Jan 22, 2010 + * Time: 11:27:33 PM + * To change this template use File | Settings | File Templates. + */ +public class PlinkRodTest extends BaseTest { + private static IndexedFastaSequenceFile seq; + + @BeforeClass + public static void beforeTests() { + try { + seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); + } catch (FileNotFoundException e) { + throw new StingException("unable to load the sequence dictionary"); + } + GenomeLocParser.setupRefContigOrdering(seq); + + } + + public BufferedReader openFile(String filename) { + try { + return new BufferedReader(new FileReader(filename)); + } catch (FileNotFoundException e) { + throw new StingException("Couldn't open file " + filename); + } + + } + + @Test + public void testStandardPedFile() { + PlinkRodWithGenomeLoc rod = new PlinkRodWithGenomeLoc("test"); + try { + rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") ); + } catch ( FileNotFoundException e ) { + throw new StingException("test file for testStandardPedFile() does not exist",e); + } + + // test that the sample names are correct + + List rodNames = rod.getVariantSampleNames(); + List expectedNames = Arrays.asList("NA12887","NAMELY","COWBA"); + + Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size()); + + boolean namesCorrect = true; + for ( int i = 0; i < expectedNames.size(); i++ ) { + namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) ); + } + + Assert.assertTrue("That the names are correct and in the proper order",namesCorrect); + + // test that rod can be iterated over + + ArrayList> genotypesInRod = new ArrayList>(); + ArrayList> sampleNamesInRod = new ArrayList>(); + ArrayList lociInRod = new ArrayList(); + do { + genotypesInRod.add(rod.getGenotypes()); + sampleNamesInRod.add(rod.getVariantSampleNames()); + lociInRod.add(rod.getLocation()); + } while ( rod.parseLine(null,null) ); + + Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size()); + + ArrayList snp1 = genotypesInRod.get(0); + ArrayList snp3 = genotypesInRod.get(2); + + Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size()); + Assert.assertEquals("That there are three samples in SNP 1", 3, sampleNamesInRod.get(0).size()); + Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size()); + + + List snp1_individual3_alleles = snp1.get(2).getAlleles(); + List snp3_individual2_alleles = snp3.get(1).getAlleles(); + + String alleleStr1 = snp1_individual3_alleles.get(0).getBases()+snp1_individual3_alleles.get(1).getBases(); + String alleleStr2 = snp3_individual2_alleles.get(0).getBases()+snp3_individual2_alleles.get(1).getBases(); + + Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1); + Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2); + + boolean name2isSame = true; + + for ( ArrayList names : sampleNamesInRod ) { + name2isSame = name2isSame && names.get(1).equals("NAMELY"); + } + + Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame); + + // test that the loci are correctly parsed and in order + + List expectedLoci = Arrays.asList("1:123456","2:13274","3:11111"); + boolean lociCorrect = true; + for ( int i = 0; i < 3; i ++ ) { + lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i)); + } + } +}