PlinkRodWithGenomeLoc now properly handles indels.

There is now a DELETION_REFERENCE allele type to allow for the storage of multi-base references rather than point-mutation references.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2667 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-01-23 07:34:52 +00:00
parent 42fb85e7f3
commit 01db93299c
3 changed files with 53 additions and 7 deletions

View File

@ -17,7 +17,7 @@ public class Allele {
// the types of variants we currently allow
public enum AlleleType {
REFERENCE, SNP, INSERTION, DELETION, INVERSION, UNKNOWN_POINT_ALLELE
REFERENCE, SNP, INSERTION, DELETION, INVERSION, UNKNOWN_POINT_ALLELE, DELETION_REFERENCE
}
public Allele(AlleleType type, String bases) {

View File

@ -21,7 +21,6 @@ import net.sf.samtools.SAMFileHeader;
* To change this template use File | Settings | File Templates.
*/
public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum {
private boolean allowTest = false;
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE"));
private final byte SNP_MAJOR_MODE = 0x00000001;
@ -97,6 +96,10 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
return currentVariant.getGenotypes();
}
/**
 * Reports whether the current variant is a SNP by delegating to
 * {@code currentVariant.isSNP()}.
 *
 * @return whatever {@code currentVariant.isSNP()} returns for the current variant
 */
public boolean variantIsSNP() {
return currentVariant.isSNP();
}
// AM I PARSING A TEXT OR A BINARY FILE ??
@ -226,6 +229,9 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
ArrayList<PlinkVariantInfo> parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile);
ArrayList<String> sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile);
java.util.Collections.sort(updatedVariants);
return updatedVariants;
}
@ -429,7 +435,7 @@ class PlinkVariantInfo implements Comparable {
return genotypes;
}
private boolean isSNP() {
public boolean isSNP() {
return this.indelType == null;
}
@ -516,6 +522,7 @@ class PlinkVariantInfo implements Comparable {
for ( String alStr : alleleStrings ) {
alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr));
}
genotypes.add(new Genotype(alleles,sampleName,20.0) );
sampleNames.add(sampleName);
}
@ -544,7 +551,7 @@ class PlinkVariantInfo implements Comparable {
alt = new Allele(indelType,baseStr);
} else {
alt = new Allele(indelType,"");
ref = new Allele(Allele.AlleleType.REFERENCE,baseStr);
ref = new Allele(Allele.AlleleType.DELETION_REFERENCE,baseStr);
}
this.setIndelLength(alt,baseStr.length());
@ -571,8 +578,8 @@ class PlinkVariantInfo implements Comparable {
allele2 = new Allele(indelType,"");
reference = false;
} else {
allele1 = new Allele(Allele.AlleleType.REFERENCE,strand1);
allele2 = new Allele(Allele.AlleleType.REFERENCE,strand2);
allele1 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand1);
allele2 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand2);
reference = true;
}
} else {
@ -597,6 +604,7 @@ class PlinkVariantInfo implements Comparable {
if ( reference || siteIndelLength != -1 ) { // if we're ref or know the insertion/deletion length of the site
Genotype gen = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0);
setIndelGenotypeLength(gen,siteIndelLength);
this.genotypes.add(gen);
this.sampleNames.add(sampleName);
} else { // hold on the variants until we *do* know the in/del length at this site

View File

@ -85,7 +85,7 @@ public class PlinkRodTest extends BaseTest {
ArrayList<Genotype> snp3 = genotypesInRod.get(2);
Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size());
Assert.assertEquals("That there are three samples in SNP 1", 3, sampleNamesInRod.get(0).size());
Assert.assertEquals("That there are three samples in SNP 2", 3, sampleNamesInRod.get(1).size());
Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size());
@ -114,4 +114,42 @@ public class PlinkRodTest extends BaseTest {
lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i));
}
}
/**
 * Checks that a standard PED file containing indels is read as the expected
 * sequence of SNP and non-SNP sites, and spot-checks individual genotypes
 * (point vs. non-point, hom vs. het, and allele base strings).
 */
@Test
public void testStandardPedFileWithIndels() {
    PlinkRodWithGenomeLoc rod = new PlinkRodWithGenomeLoc("test");
    try {
        rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_with_indels.ped") );
    } catch ( FileNotFoundException e) {
        throw new StingException("Test file for testStandardPedFileWithIndels() could not be found", e);
    }

    // Iterate through the rod. The do/while records the rod's current variant
    // before the first parseLine() call, then keeps going while parseLine() returns true.
    List<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
    ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
    ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
    ArrayList<Boolean> snpSites = new ArrayList<Boolean>();
    do {
        genotypesInRod.add(rod.getGenotypes());
        sampleNamesInRod.add(rod.getVariantSampleNames());
        lociInRod.add(rod.getLocation());
        snpSites.add(rod.variantIsSNP());
    } while ( rod.parseLine(null,null) );

    List<Boolean> expectedOrder = Arrays.asList(true,false,true,false);
    // Fail with a readable message instead of an IndexOutOfBoundsException when
    // fewer sites were read than the expectations below index into.
    Assert.assertTrue("That at least four variant sites were read", snpSites.size() >= expectedOrder.size());

    boolean snpOrder = true;
    for ( int i = 0; i < expectedOrder.size(); i ++ ) {
        // Compare the boxed Booleans by value; == would compare object references.
        snpOrder = snpOrder && expectedOrder.get(i).equals(snpSites.get(i));
    }

    Assert.assertTrue("That the variant type order is as expected", snpOrder);
    Assert.assertTrue("That the second genotype of second variant is not a point mutation", ! genotypesInRod.get(1).get(1).isPointGenotype() );
    Assert.assertTrue("That the second genotype of fourth variant is not a point mutation", ! genotypesInRod.get(3).get(1).isPointGenotype() );
    Assert.assertTrue("That the second genotype of fourth variant is homozygous", genotypesInRod.get(3).get(1).isHom());
    Assert.assertTrue("That the fourth genotype of fourth variant is heterozygous",genotypesInRod.get(3).get(3).isHet());
    Assert.assertEquals("That the reference deletion genotype has the correct string", "ATTTAT",genotypesInRod.get(3).get(2).getAlleles().get(0).getBases());
    Assert.assertEquals("That the insertion bases are correct","CTC",genotypesInRod.get(1).get(2).getAlleles().get(0).getBases());
    Assert.assertEquals("That the snp bases are correct","GC",genotypesInRod.get(2).get(2).getAlleles().get(0).getBases()+genotypesInRod.get(2).get(2).getAlleles().get(1).getBases());
}
}