#!/usr/bin/perl # protloctab.pl =head1 protloctab generate gene order table for 12 dros species from tblastn located genes of DrosMel proteins. =cut @gtabflds= ("species", "chr", "source", "type", "start", "end", "score", "strand", "protid"); @allspp = qw( celegans dpulex agam); push(@allspp, "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri"); %wspp=(); while(<>){ chomp; my($species, $chr, $source, $type, $start, $end, $score, $strand, $protid) = split"\t"; $protid =~ s/\-P.$//; $tab{$protid}{$species}= [$chr,$start,$end,$strand,$score]; $ids{$protid}++; $wspp{$species}++; } @wsppset= grep {$wspp{$_} } @allspp; geneorder(); #------ sub geneorder { my @gns= (sort keys %ids); my $maxd = 50000; # 150000; #1 000 000; print join("\t","species","gene_pair","dist"),"\n"; foreach $species (@wsppset) { foreach my $ga (@gns) { my($chr,$start,$end,$strand,$score) = @{ $tab{$ga}{$species} }; next unless($chr); my @near=(); foreach my $gb (@gns) { next if($gb le $ga); ## upper triangle here my($bchr,$bstart,$bend,$bstrand,$bscore) = @{ $tab{$gb}{$species} }; next unless($bchr eq $chr); my $d= abs($start - $bstart); next if($d < 100); # overlaps push(@near,[$d,$gb]); } @near= sort { $$a[0] <=> $$b[0] } @near; for $i (0..4) { $nr= $near[$i]; ($d,$gb)= ($$nr[0],$$nr[1]); next unless($gb); print join("\t",$species,"$ga:$gb",($i+1)),"\n"; } } } } =head1 R cluster distance matrix library("cluster") flyorder <- c("dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri") allorder <- c("celegans","dpulex","agam",flyorder) god <- read.delim("geneorderdm-rerun0804.tab") godmat <- xtabs(dist ~ species + gene_pair, god) godmat[godmat == 0] <- NA dogodmat <- daisy(godmat[allorder,],type=list(ordratio=1:ncol(godmat))) write.table(as.matrix(dogodmat), file="dogodmat-rerun0804") Dissimilarities : celegans dpulex agam dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dpulex 0.520 agam 0.475 0.465 dmel 0.690 0.734 0.741 dsim 0.710 0.745 0.749 0.209 dsec 0.693 0.735 0.752 0.166 0.220 dyak 0.691 0.730 0.746 0.190 0.254 0.213 dere 0.725 0.749 0.740 0.217 0.267 0.212 0.227 dana 0.732 0.727 0.758 0.290 0.314 0.295 0.288 0.285 dper 0.722 0.746 0.753 0.317 0.336 0.324 0.319 0.311 0.314 dpse 0.710 0.705 0.750 0.308 0.327 0.309 0.312 0.302 0.302 0.227 dwil 0.722 0.777 0.756 0.350 0.361 0.354 0.347 0.346 0.335 0.344 0.330 dmoj 0.726 0.750 0.752 0.336 0.347 0.336 0.333 0.336 0.326 0.333 0.315 0.337 dvir 0.751 0.736 0.754 0.334 0.351 0.332 0.337 0.337 0.331 0.337 0.326 0.338 0.287 dgri 0.732 0.758 0.769 0.343 0.356 0.343 0.340 0.336 0.333 0.331 0.321 0.337 0.312 0.305 =head1 phylip Fitch tree from distanct matrix # use Jumble randomize input matrix (((((((dmel:0.07990,dsec:0.08640):0.01029,dsim:0.12116):0.00829, dyak:0.10516):0.00851,dere:0.11087):0.03503,dana:0.14300):0.00962, ((dpse:0.10745,dper:0.11925):0.03886, (dwil:0.17348,((dvir:0.14412,dmoj:0.14258):0.00926, dgri:0.15610):0.00994):0.01107):0.00359):0.31744, (agam:0.23370,dpulex:0.23160):0.02979,celegans:0.23294); =head1 input tblastn data table location of highest scoring tblastn match of protein to each genome dgbook2% zmore dspp-protdm-matches.gtab.gz ------> dspp-protdm-matches.gtab.gz <------ dmel 2L modDM tblastn 7681 9274 0.0 + CG11023-PA dmel 2L modDM tblastn 11219 17137 0.0 - CG2671-PA dmel 2L modDM tblastn 21923 23889 5e-176 - CG2657-PA ... for each species =head1 output table: gene pair order, per species, gene_pair pair order is relative rank of absolute distance b/n genes dgbook2% grep 'CG1906:' geneorderdm-rerun0804.tab species gene_pair dist dmel CG1906:CG31041 1 dmel CG1906:CG7568 2 dmel CG1906:CG7567 3 dmel CG1906:CG1907 4 dmel CG1906:CG7577 5 dsim CG1906:CG31041 1 dsim CG1906:CG7568 2 dsim CG1906:CG7567 3 dsim CG1906:CG1907 4 dsim CG1906:CG7577 5 ... dwil CG1906:CG7568 1 dwil CG1906:CG31041 2 dwil CG1906:CG7567 3 dwil CG1906:CG1907 4 dwil CG1906:CG7514 5 dmoj CG1906:CG6036 1 dmoj CG1906:CG7568 2 dmoj CG1906:CG7567 3 dmoj CG1906:CG1907 4 dmoj CG1906:CG33337 5 =cut