#!/usr/bin/perl BEGIN { %maingrep = ( dana => '(scaffold_13340|scaffold_13337|scaffold_13266|scaffold_12916|scaffold_13417|scaffold_13117|scaffold_12911|scaffold_12943|scaffold_13248|scaffold_13335|scaffold_12929|scaffold_13047|scaffold_13334)', #" " dere => '(scaffold_4929|scaffold_4784|scaffold_4845|scaffold_4690|scaffold_4770|scaffold_4820|scaffold_4644)', #" " dgri => '(scaffold_15110|scaffold_15245|scaffold_15252|scaffold_14906|scaffold_15203|scaffold_14853|scaffold_15126|scaffold_15074|scaffold_14830|scaffold_15112|scaffold_15081)', #" " dmoj => '(scaffold_6500|scaffold_6496|scaffold_6680|scaffold_6654|scaffold_6540|scaffold_6473|scaffold_6359|scaffold_6328|scaffold_6308)', #" " dper => '(scaffold_0|scaffold_1|scaffold_2|scaffold_3|scaffold_4|scaffold_5|scaffold_6|scaffold_7|scaffold_8|scaffold_9|scaffold_10|scaffold_11|scaffold_12|scaffold_13|scaffold_14|scaffold_15|scaffold_16|scaffold_17|scaffold_18|scaffold_19|scaffold_20|scaffold_21|scaffold_22|scaffold_23|scaffold_24)', #" " dsec => '(scaffold_0|scaffold_1|scaffold_2|scaffold_3|scaffold_4|scaffold_5|scaffold_6|scaffold_7|scaffold_8|scaffold_9|scaffold_10|scaffold_11|scaffold_12|scaffold_13|scaffold_14|scaffold_15|scaffold_16|scaffold_17|scaffold_18|scaffold_19)', #" " dvir => '(scaffold_13049|scaffold_12875|scaffold_12963|scaffold_13047|scaffold_12970|scaffold_12855|scaffold_12928|scaffold_12723|scaffold_13042|scaffold_12822|scaffold_12726|scaffold_13246|scaffold_12823|scaffold_12932)', #" " dwil => '(scaffold_181130|scaffold_180708|scaffold_181096|scaffold_181089|scaffold_180698|scaffold_180772|scaffold_180700|scaffold_180949|scaffold_181141|scaffold_181150|scaffold_180777|scaffold_181108|scaffold_180702|scaffold_180697|scaffold_180764|scaffold_180703|scaffold_180701|scaffold_181009|scaffold_181009|scaffold_180955|scaffold_180745|scaffold_180727|scaffold_180916|scaffold_180699)', #" " dsim => '-(.*random|chrU)', #" -v " dyak => '-(.*random|chrU)', #" -v " dpse => '-(.*random|U|Unknown.*)', #" -v " dmel => '-(.*random|U.*)', dpulex => '#150', ## dpulex => scaffold # < 150 ); } ## input is dspp*.fa.count ; make sums for main / small scaffolds #seq len A C G T N cpg sub facountMain { ##my $spp= $ENV{dp} || $ENV{species}; my @inf= @ARGV; foreach $inf (@inf) { my ($spp)= $inf =~ m,/(d...)[^/]+$,; open(IN,$inf); my(@mains, @alts); while() { chomp; $head= $_ if (/^#seq/); next unless(/^\w/); if(/^total/) { $total=$_; next; } my @v= split; my ($srcid,$l,$a,$c,$g,$t,$n,$cpg)= @v; my $grep= $maingrep{$spp} || ""; my $main=1; $srcid =~ s/super_/scaffold_/; if ($grep && $srcid) { if($grep =~ s/^\-//) { $main= ($srcid =~ m/^$grep$/) ? 0 : 1; } elsif($grep =~ s/^\#//) { my $nscaf= int($grep); my ($idn)= ($srcid =~ m/(\d+)/); $main = ($idn > $nscaf) ? 0 : 1; } else { $main = ($srcid =~ m/^$grep$/) ? 1 : 0; } } if($main) { $mains[0]++; foreach my $i (1..$#v){ $mains[$i] += $v[$i] ;} } else { $alts[0]++; foreach my $i (1..$#v) { $alts[$i] += $v[$i] ; } } } close(IN); print "\n#species: $spp facount by main,small scaffolds\n"; $head =~ s/#seq/#nscaf/; # add cols: %GC, %Ns, %total $head .= "\t%GC\t%Ns\t%total"; @tot= split("\t",$total); $gc= sprintf "%5.3f",($mains[3]+$mains[4]) / ($mains[1]-$mains[6]); $ns= sprintf "%5.3f",$mains[6] / $mains[1]; $tp= sprintf "%5.3f",$mains[1] / $tot[1]; print $head,"\n"; print join("\t",@mains,$gc,$ns,$tp),"\tmain_scaffolds\n"; $gc= sprintf "%5.3f",($alts[3]+$alts[4]) / ($alts[1]-$alts[6]); $ns= sprintf "%5.3f",$alts[6] / $alts[1]; $tp= sprintf "%5.3f",$alts[1] / $tot[1]; print join("\t",@alts,$gc,$ns,$tp),"\tsmall_scaffolds\n"; print $total,"\n"; print "#","-"x70,"\n"; } } facountMain(); =item count main genes / scaff type foreach ds ( $dpp ) setenv dp $ds ; gzgrep modDM $sc/caf1a/dgil/$dp*-hsp.gff.gz | grep '_G1' | \ perl -Mmainscaf -ne\ '($r)=split; ($g)=m/Parent=(\w+)/; next if($g eq $lg); $lg=$g; $spp=$ENV{dp}; \ if(mainscaf::mainscaf($spp,$r)){$mg++;}else{$ag++;} \ END{print "$spp genes: main=$mg; altscaf=$ag\n";}' end # Dmel primary gene matches on main, small scaffolds: dana dmelgene: main=12563; altscaf=519 dere dmelgene: main=13222; altscaf=109 dgri dmelgene: main=11667; altscaf=956 dmel dmelgene: main=13401; altscaf=0 dmoj dmelgene: main=12424; altscaf=264 dper dmelgene: main=9955; altscaf=2873 dpse dmelgene: main=12719; altscaf=165 dsec dmelgene: main=11506; altscaf=1859 dsim dmelgene: main=12115; altscaf=1184 dvir dmelgene: main=12109; altscaf=603 dwil dmelgene: main=11939; altscaf=854 dyak dmelgene: main=12985; altscaf=349 # Dmel dna align foreach ds ( $dpp ) setenv dp $ds ; gzcat $sp/web/data/$dp/gff/$dp*-dmel-algn.gff.gz | \ perl -Mmainscaf -ne\ '($r,$s,$t,$b,$e)=split; $eb=$e-$b+1; ($a)=m/align=(\d+)/; $a=$eb if($eb<$a); $spp=$ENV{dp}; \ if(mainscaf::mainscaf($spp,$r)){$ma+=$a;}else{$sa+=$a;} \ END{print "$spp dna-align: main=$ma; altscaf=$sa\n";}' end # Dmel dna match on main, small scaffolds dana dna-align: main=14627283; altscaf=3244967 dere dna-align: main=45549994; altscaf=3148880 dgri dna-align: main=6410111; altscaf=1169329 dmoj dna-align: main=7441223; altscaf=747199 dper dna-align: main=9771233; altscaf=3955964 dpse dna-align: main=12081282; altscaf=668957 dsec dna-align: main=41082874; altscaf=17781458 dsim dna-align: main=44822830; altscaf=6016134 dvir dna-align: main=7365978; altscaf=1494769 dwil dna-align: main=6979598; altscaf=1758918 dyak dna-align: main=45295152; altscaf=7926656 =cut