#!/usr/bin/perl # blast2map.pl my $novote= 0; # simple page for viewing =item result from dgg Method Score DGIL_SNP 74 NCBI_GNO 32 EISE_CGM 10 OXFD_GPI 2 PACH_GMP 0 FlyGeneHSP 20 erecta_Acp100=15 yakuba_Acp133=20 yakuba_Acp134=24 yakuba_Acp157=13 yakuba_Acp157a=13 yakuba_Acp158=0 yakuba_Acp223=7 yakuba_Acp224=9 yakuba_Acp225=35 yakuba_Gene144=2 =cut my $footer= <<'EOF';

New gene dyak-acps-begun.fasta and Source data, code for this table

Don Gilbert, may 06

my results:

Method    Score
DGIL_SNP        74
NCBI_GNO        32
EISE_CGM        10
OXFD_GPI        2
PACH_GMP        0
FlyGeneHSP      20
EOF my $header = <<"EOF";

Drosophila gene prediction assessment

How well do gene predictors find new genes found by other methods?
Test using coding sequence identified for new Dyak (9) and Dere (1) genes from

Recently Evolved Genes Identified From Drosophila yakuba and D. erecta Accessory Gland Expressed Sequence Tags
David J. Begun, Heather A. Lindfors, Melissa E. Thompson and Alisha K. Holloway
Section of Evolution and Ecology, University of California, Davis, California 95616
Genetics, Vol. 172, 1675-1681, March 2006, Copyright 2006 doi:10.1534/genetics.105.050336

EOF if($novote) { $header .= << "EOF"; Use the web links in the "match link" column to view annotations at the site of these BLAST-located coding sequence of new genes. Score the gene predictions here.

EOF } else { $header .= << "EOF"; Assessment: Eye-ball method. Use the web links in the "match link" column to view annotations at the site of these BLAST-located coding sequence of new genes. To score annotations, choose for each annotation method how well it finds the new gene. Scores are (5) exon overlap, (2) exon within 1kb, (1) exon within 2kb, only where exon doesn't belong to other well-known gene. Clicking the Submit button will e-mail your scoring to drospege.
EOF } my $EXPAND = 2500; my $MAX_EXON_DIST = 5000; my $MIN_BITSCORE = 99; my $baseurl= "/species"; #my $baseurl= "http://melon.bio.indiana.edu"; my $gbrowsebaseurl= $baseurl."/cgi-bin/gbrowse" ; my $gbrowseimgurl= $baseurl."/cgi-bin/gbrowse_img" ; my $gbrowseurl= $gbrowsebaseurl.'/dmel/?'; my $seqparam_gff= ";plugin=BatchDumper;BatchDumper.fileformat=gff3;plugin_do=Go"; my @preds= qw( DGIL_SNP EISE_CGM NCBI_GNO OXFD_GPI PACH_GMP FlyGeneHSP ); my $predval=""; print "\n"; print $header; unless($novote) { print '

',"\n"; ## FIXME #print '',"\n"; print "\n"; print "\n"; print "\n"; print "
\n"; } print "\n"; print "\n" unless($novote); my @predcols= ($novote) ? () : @preds; print "\n"; while() { chomp; if(/^#/){ printHsp($lsid,$lqid,\@hsg,$hspid) if(@hsg); @hsg=(); $hspid= 0; print "\n" if(/BLASTN/); } elsif(/^\w/) { my @v= split "\t"; my ($qid, $sid, $pctident, $alignment_length, $mismatches, $gap_openings, $q_start, $q_end, $s_start, $s_end, $prob, $bit_score ) = @v; my($s_strand,$q_strand)= ('+','+'); if ($s_start > $s_end) { $s_strand='-'; ($s_start,$s_end)= ($s_end,$s_start); } #if ($q_start > $q_end) { $q_strand='-'; ($q_start,$q_end)= ($q_end,$q_start); } cleanid($qid); cleanid($sid); my($sorg,$schr)= split(/[:|]/, $sid,2); $sorg =~ s/_.*$//; ## need to string some of these HSPs together as group ... ##warn "nearby($qid/$lqid,$sid,$lsid,) = \n"; unless($qid eq $lqid && $sid eq $lsid && $s_strand eq $lstrand && nearby($s_start,$s_end,$lstart,$lend)) { printHsp($lsid,$lqid,\@hsg,$hspid); @hsg=(); $hspid++; } push(@hsg,[$qid, $sid, $s_start, $s_end, $s_strand, $bit_score]); # my $gurl= gbrowseFromOrgChrStartStop($sorg, $schr, $s_start, $s_end, $s_strand, $qid); # (my $hgff = $gurl) =~ s/;add=.*$//; # $hgff = " GFF"; # print "\n"; ($lqid,$lsid,$lstrand,$lstart,$lend)= ($qid,$sid,$s_strand,$s_start,$s_end); } } printHsp($lsid,$lqid,\@hsg,$hspid) if(@hsg); print "\n"; print "
Annotation methods
",join("", "query","bitscore","match link",@predcols), "

",join("", $qid, $bit_score, "$sid .. $hgff"), "
",join("", "query","bitscore","match link",@predcols), "
\n"; print "
\n" unless($novote); print $footer; print "\n"; sub amin { my($a,$b)= @_; $a= abs($a); $b= abs($b); return ($a < $b) ? $a : $b; } sub nearby { my($sb,$se,$lb,$le)= @_; my $dist= amin($sb - $le, $se - $lb); ##warn "nearby($sb,$se,$lb,$le) = $dist\n"; return ($dist <= $MAX_EXON_DIST); } sub printHsp { my($sid, $qid, $hsp,$hspid)= @_; my($sorg,$schr)= split(/[:|]/, $sid,2); $sorg =~ s/_.*$//; my($bstart,$bend,$bstrand); my $bits= 0; foreach my $p (@$hsp) { my($xqid, $xsid, $s_start, $s_end, $s_strand, $bit_score)= @$p; $bits += $bit_score; $bstrand= $s_strand; $bstart= $s_start if($bstart==0 || $bstart>$s_start); $bend= $s_end if($bend < $s_end); } return unless($bend); return if ($bits < $MIN_BITSCORE); my $gurl= gbrowseFromOrgChrStartStop($sorg, $schr, $bstart, $bend, $bstrand, $qid, $hsp); my $hmap = " $sid"; (my $hgff = $gurl) =~ s/;add=.*$//; $hgff = " GFF"; my @predvals= map { my $pgid=join("_",$_,$qid,$hspid); (my $v= $predval) =~ s/GENEID_GRPID/$pgid/; $v; } @preds; @predvals=() if ($novote); #print join("\t", $qid, $bit_score, $sid, $gurl), "\n"; print "",join("", $qid, $bits, "$hmap .. $hgff", @predvals), " \n"; } sub checkorg { my($inorg,$chr)= @_; $inorg= lc($inorg); if ($inorg eq 'dmel' && $chr =~ /h|Y|U/) { return 'dmelhet'; } else { return "$inorg-hsg"; } ## FIXME } sub orgmaps { my($inorg)= @_; return 1; # return $orgmaps{$inorg}; } sub gbrowseurl { my ($inorg,$chr)= @_; my $org= checkorg($inorg,$chr); # need all fasta header here - check for release= return undef unless($org); my $gbrowseurl= (orgmaps($org)) ? $gbrowsebaseurl."/$org/?" : ''; return $gbrowseurl; } sub gbrowseFromOrgChrStartStop { my ($inorg,$chr,$bstart,$bend,$bstrand, $qid, $parts) = @_; my $url=''; my $gbrowseurl= gbrowseurl($inorg,$chr); if ($bend && $gbrowseurl) { my($vstart,$vend)= ($bstart-$EXPAND,$bend+$EXPAND); my $bloc= "$bstart..$bend"; if(ref $parts) { ## gbrowse handle_quickie doesn't have strand field; can we swap start,end to show? ## only works with code changes to gbrowse (handle_quickie didnt handle strand) $bloc=""; my @parts= @$parts; @parts= reverse @parts if($bstrand eq '-'); foreach my $p (@parts) { my($qid, $sid, $s_start, $s_end, $s_strand, $bit_score)= @$p; $bloc .= ($bstrand eq '-') ? "$s_end-$s_start,": "$s_start-$s_end,"; } $bloc =~ s/,$//; } $url="${gbrowseurl}name=$chr:$vstart..$vend;add=$chr+protein_match+$qid+$bloc"; ## gbrowse } return $url; } # BEGIN{ $faprefix= 'gnl\|'; } sub cleanid { unless( $_[0] =~ s/gnl\|//) { $_[0] =~ s/^gi\|\d+\|(\S+)/$1/; } $_[0] =~ s/\|$//; ## ncbEC, ncbAt have '|' at end of id; chomp here? $_[0] =~ s/\|/:/; #?? change to db:id ? } __DATA__ # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 10 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Acp224 gnl|dyak_caf051213|chr2L 100.00 810 0 0 1 810 8255352 8256161 0.0 1606 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 9 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Acp157a gnl|dyak_caf051213|chr2R 100.00 453 0 0 1 453 10824668 10824216 0.0 856 yakuba_Acp157a gnl|dyak_caf051213|chr2R 89.07 375 38 1 60 431 10822736 10822362 2e-101 373 yakuba_Acp157a gnl|dyak_caf051213|chr2R 92.98 57 4 0 5 61 10822804 10822748 1e-13 81.8 yakuba_Acp157a gnl|dyak_caf051213|chr2R 95.83 48 2 0 12 59 10877209 10877256 5e-13 79.8 yakuba_Acp157a gnl|dyak_caf051213|chr2R 90.20 51 5 0 13 63 10825675 10825625 1e-07 61.9 yakuba_Acp157a gnl|dyak_caf051213|chr2R 91.89 37 3 0 27 63 10824032 10823996 4e-04 50.1 yakuba_Acp157a gnl|dsec_br051028|scaffold_1 92.31 52 4 0 10 61 8404252 8404201 1e-10 71.9 yakuba_Acp157a gnl|dsim_wu050602|chr2R 90.38 52 5 0 10 61 9650845 9650794 3e-08 63.9 yakuba_Acp157a gnl|dsim_wu050602|chr2R 91.11 45 4 0 10 54 9661482 9661438 2e-06 58.0 yakuba_Acp157a gnl|dmel_r4.1|2R 91.49 47 4 0 13 59 10518403 10518357 1e-07 61.9 yakuba_Acp157a gnl|dmoj_caf060210|scaffold_6473 100.00 24 0 0 428 451 7278682 7278705 0.002 48.1 yakuba_Acp157a gnl|dmoj_caf060210|scaffold_6473 100.00 23 0 0 429 451 13102764 13102786 0.007 46.1 yakuba_Acp157a gnl|dsim_wu050602|chr2L 100.00 23 0 0 30 52 14325791 14325813 0.007 46.1 yakuba_Acp157a gnl|dyak_caf051213|chr2L 96.30 27 1 0 425 451 7976826 7976800 0.007 46.1 yakuba_Acp157a gnl|dmel_r4.1|3R 100.00 23 0 0 429 451 7145382 7145360 0.007 46.1 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 8 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Acp225 gnl|dyak_caf051213|chr3R 100.00 480 0 0 1 480 25153955 25154434 0.0 952 yakuba_Acp225 gnl|dsec_br051028|scaffold_13 87.14 140 18 0 274 413 114481 114342 1e-29 135 yakuba_Acp225 gnl|dsec_br051028|scaffold_13 83.33 174 29 0 35 208 114716 114543 9e-24 115 yakuba_Acp225 gnl|dsim_wu050602|chr3R 86.29 124 17 0 274 397 21157755 21157632 1e-22 111 yakuba_Acp225 gnl|dsim_wu050602|chr3R 82.18 174 31 0 35 208 21157990 21157817 6e-19 99.6 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 7 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Acp134 gnl|dyak_caf051213|chr2R 100.00 240 0 0 1 240 10877210 10877449 1e-132 476 yakuba_Acp134 gnl|dyak_caf051213|chr2R 88.51 87 10 0 1 87 10822796 10822710 2e-17 93.7 yakuba_Acp134 gnl|dyak_caf051213|chr2R 95.74 47 2 0 1 47 10824656 10824610 1e-12 77.8 yakuba_Acp134 gnl|dyak_caf051213|chr2R 93.62 47 3 0 1 47 10825675 10825629 2e-10 69.9 yakuba_Acp134 gnl|dyak_caf051213|chr2R 90.91 44 4 0 4 47 10824043 10824000 4e-06 56.0 yakuba_Acp134 gnl|dmel_r4.1|2R 92.05 88 7 0 1 88 10518403 10518316 3e-25 119 yakuba_Acp134 gnl|dmel_r4.1|2R 86.76 68 9 0 7 74 2232318 2232251 1e-08 63.9 yakuba_Acp134 gnl|dere_caf060210|scaffold_4845 86.57 134 17 1 27 159 14960129 14960262 5e-24 115 yakuba_Acp134 gnl|dsec_br051028|scaffold_1 89.66 87 9 0 1 87 8404249 8404163 7e-20 101 yakuba_Acp134 gnl|dsim_wu050602|chr2R 88.04 92 11 0 1 92 9650842 9650751 4e-18 95.6 yakuba_Acp134 gnl|dsim_wu050602|chr2R 86.84 76 10 0 1 76 9661479 9661404 6e-11 71.9 yakuba_Acp134 gnl|dsim_wu050602|chr2R 87.30 63 8 0 26 88 9650082 9650020 6e-08 61.9 yakuba_Acp134 gnl|dsim_wu050602|chr2L 100.00 23 0 0 18 40 14325791 14325813 0.003 46.1 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 6 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Acp223 gnl|dyak_caf051213|chr2R 100.00 716 0 0 1 716 13569965 13569250 0.0 1378 yakuba_Acp223 gnl|dere_caf060210|scaffold_4845 83.63 226 37 0 193 418 9800836 9801061 2e-35 155 yakuba_Acp223 gnl|dere_caf060210|scaffold_4845 92.68 41 3 0 668 708 9801313 9801353 3e-06 58.0 yakuba_Acp223 gnl|dsec_br051028|scaffold_337 90.48 63 6 0 370 432 10179 10241 3e-12 77.8 yakuba_Acp223 gnl|dsec_br051028|scaffold_337 92.68 41 3 0 668 708 10417 10457 3e-06 58.0 yakuba_Acp223 gnl|dsim_wu050602|chr2R 92.11 38 3 0 671 708 14302146 14302183 2e-04 52.0 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 5 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Acp133 gnl|dyak_caf051213|chr2R 100.00 387 0 0 1 387 18644348 18643962 0.0 767 yakuba_Acp133 gnl|dsim_wu050602|chr2R 90.99 111 10 0 250 360 11561550 11561660 1e-31 141 yakuba_Acp133 gnl|dsim_wu050602|chr2R 84.00 125 20 0 26 150 11561321 11561445 4e-16 89.7 yakuba_Acp133 gnl|dsec_br051028|scaffold_1 90.35 114 11 0 247 360 10328583 10328696 5e-31 139 yakuba_Acp133 gnl|dsec_br051028|scaffold_1 84.13 126 20 0 25 150 10328356 10328481 1e-16 91.7 yakuba_Acp133 gnl|dmel_r4.1|2R 87.69 130 16 0 231 360 12452664 12452793 1e-28 131 yakuba_Acp133 gnl|dmel_r4.1|2R 87.38 103 13 0 25 127 12452456 12452558 1e-19 101 yakuba_Acp133 gnl|dere_caf060210|scaffold_4845 84.66 176 26 1 177 352 13018738 13018564 2e-27 127 yakuba_Acp133 gnl|dere_caf060210|scaffold_4845 86.29 124 15 1 2 123 13018914 13018791 7e-21 105 yakuba_Acp133 gnl|dere_caf060210|scaffold_4845 88.52 61 7 0 46 106 13020226 13020166 6e-09 65.9 yakuba_Acp133 gnl|dere_caf060210|scaffold_4845 87.72 57 7 0 173 229 13020099 13020043 2e-06 58.0 yakuba_Acp133 gnl|dere_caf060210|scaffold_4845 93.75 32 2 0 270 301 13020005 13019974 0.001 48.1 yakuba_Acp133 gnl|dmel_r4.1|2L 96.30 27 1 0 48 74 6260730 6260704 0.006 46.1 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 4 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Acp158 gnl|dyak_caf051213|chr2R 100.00 450 0 0 1 450 18643120 18642671 0.0 850 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 3 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Gene144 gnl|dyak_caf051213|chr2L 100.00 265 0 0 1 265 15682865 15683129 1e-147 525 yakuba_Gene144 gnl|dere_caf060210|scaffold_4690 100.00 23 0 0 162 184 3559392 3559414 0.004 46.1 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: 2 sequences # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score yakuba_Acp157 gnl|dyak_caf051213|chr2R 100.00 450 0 0 1 450 10822788 10822339 0.0 850 yakuba_Acp157 gnl|dyak_caf051213|chr2R 89.07 375 38 1 53 427 10824609 10824238 2e-101 373 yakuba_Acp157 gnl|dyak_caf051213|chr2R 87.34 79 10 0 1 79 10877218 10877296 2e-12 77.8 yakuba_Acp157 gnl|dyak_caf051213|chr2R 95.12 41 2 0 1 41 10825667 10825627 7e-09 65.9 yakuba_Acp157 gnl|dyak_caf051213|chr2R 94.87 39 2 0 3 41 10824036 10823998 1e-07 61.9 yakuba_Acp157 gnl|dyak_caf051213|chr2R 94.29 35 2 0 7 41 10824642 10824608 3e-05 54.0 yakuba_Acp157 gnl|dere_caf060210|scaffold_4845 84.96 113 17 0 12 124 14960122 14960234 5e-16 89.7 yakuba_Acp157 gnl|dsec_br051028|scaffold_1 88.89 72 8 0 1 72 8404241 8404170 5e-13 79.8 yakuba_Acp157 gnl|dsim_wu050602|chr2R 87.50 72 9 0 1 72 9650834 9650763 1e-10 71.9 yakuba_Acp157 gnl|dmel_r4.1|2R 84.81 79 12 0 1 79 10518395 10518317 1e-07 61.9 # BLASTN 2.2.11 [Jun-05-2005] # Query: yakuba_Acp224 ; dyak/chr2L:8255555..8256022; CG32354_o2 # Database: dmel-chromosome dyak-chromosome dsim-chromosome dsec-chromosome dana-chromosome dere-chromosome dmoj-chromosome # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score # Query: erecta_Acp100 # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score erecta_Acp100 gnl|dere_caf060210|scaffold_4929 100.00 630 0 0 1 630 7281653 7282282 0.0 1249 erecta_Acp100 gnl|dyak_caf051213|chr2L 90.48 63 6 0 1 63 19792359 19792297 3e-12 77.8 erecta_Acp100 gnl|dyak_caf051213|chr2L 93.55 31 2 0 586 616 19791537 19791507 0.010 46.1 erecta_Acp100 gnl|dsim_wu050602|chrX_random 96.30 27 1 0 465 491 5369049 5369023 0.010 46.1 erecta_Acp100 gnl|dyak_caf051213|chrX 96.30 27 1 0 465 491 19975908 19975882 0.010 46.1 erecta_Acp100 gnl|dmel_r4.1|X 96.30 27 1 0 465 491 20699782 20699756 0.010 46.1