#!/usr/bin/perl # tilexscoregene.pl =head1 ABOUT tilexscoregene.pl - This program pastes tile array signal scores onto transfrag or gene predictions. Transfrag or other transcript input is GFF v3 format. TIle signal scores overlapping input are in various BED-like location/score table formats defined by experimental data. Subs affy_grfile() and Manak_grfile() read two of these. It will need adapting to local file sets. Signal scores are read in, with assumption of several treatment group signals/tile location. GFF features are then read, and overlapping tile scores are added, averaged per feature unit (exon, transfrag, etc.). Output is a table of gene/exon/transfrag ID, followed by columns of ave score per treatment. =head1 SYNOPSIS perl tilexscoregene.pl sig_dmelchr4/*sig.gr dmel5chr4-genes.gff [or dmel5chr4-genes.exons] =head1 TODO Please read the source code and adapt to your needs. - option to change to per-base score; dont want to confuse tiny/big exons - signal is location, score/tile (fixed length), now fixed at 35base width. * option to output BED format scores (ref,start-1,end,score) for bed2wiggle to display on a per-exon or per-gene (and per-intron) - need to add headers that group by chrom, treatment perl tilexscoregene.pl -format bed -exon -chr=$chr \ $sc/dmel5/modenc/38bp-arrays/signal-graphs/gr_*_chr${chr}_0?sig.gr.bz2\ dmel5.exons > dmel5chr$chr-exontileaf.bed =head1 source data sets (local copy) $sc/dmel5/modenc/Manak_et_al/sigr/Dro_Total_AS_*_B1.$chr.sig.gr.bz2 $sc/dmel5/modenc/38bp-arrays/signal-graphs/gr_*_${chr}_0?sig.gr.bz2 =head1 AUTHOR don gilbert, gilbertd@indiana.edu, may 2008 =cut use strict; use File::Basename; use Getopt::Long; my $BIN=1000; # change to 500? my $INOFF=150; # change to 100? my $TILEW= 35; # FIXME my $pctover= 0.8; # was 0.8 my $span=0; my $debug=1; my $idpatt='Parent'; my $chr=""; my $exonscore=0; my(%gt, %gr, %genes, %scores, %genelocs, $format); my $dolog=0; my $optok= Getopt::Long::GetOptions( 'chr|ref=s' => \$chr, 'TILEW=i' => \$TILEW, 'exonscore!' => \$exonscore, # or scoretype=gene|exon|... 'format=s' => \$format, 'debug!' => \$debug, ); die "usage: tilexscoregene.pl -chr 4 sig_dmelchr4/*sig.gr dmel5-genes.gff options: -tile $TILEW -[no]exons -format table|bed|..." unless($optok); $format ||= "table"; # allow .gr.bz2 input # .sig.gr.bz2 my @sigr = grep(/\.gr/, @ARGV); #Dro_Total_AS_10_B1*.sig.gr and gr_564_Dro2_AS_GM2_chr4_0.sig.gr my @gff = grep(/\.(gff|exons)/, @ARGV); #? add -gff file option? my (@signa, @sigv, @sigloc, %sigbin, @siginfo); # from readsigs readsigs(); my @gt= @signa; # old# my @gt= sort keys %gt; foreach my $gff (@gff) { open(G,$gff); # stdin, .gz opts? my (@lv,@nv,@mv,%exonids); while(){ next unless(/^\w/ and /\t(exon)/); #? mRNA| chomp; @lv= @mv; @mv= @nv; @nv= split"\t"; next unless(@mv); my($ref,$src,$typ,$gb,$ge)= @mv; next if($chr && $chr ne $ref); # add ref $ref filter ... if($src =~ m/FlyBase/){ $idpatt= 'ID'; } else { $idpatt='Parent'; } # fixme; option $mv[-1] =~ m/$idpatt=([^;\s]+)/; my $ids=$1; # may be id,list next if($ids =~ m/\.t[2-9]/); # FIXME: augustus alttr # should filter out 2ndary mRNA's from here, keep only 1/gene my @ids= (split ",",$ids); foreach my $id (@ids) { $exonids{$id}++; } if($exonscore) { # add exon.num for idpatt == Parent #foreach (@ids) { my $xi= $exonids{$_}; $_= "$_:$xi" if($xi>1); } foreach (@ids) { my $xi= $exonids{$_}; $_= "$_:$xi" if($idpatt eq 'Parent'); } } else { foreach (@ids) { s/[:-].*$//; } # exon part } foreach my $id (@ids) { $genes{$id}++; $genelocs{$id}= "$gb\t$ge"; } my $nb= ($nv[0] eq $ref) ? $nv[3] : $ge + $INOFF; my $le= ($lv[0] eq $ref) ? $lv[4] : 0; my %didtile=(); foreach my $bin ( int(($gb-$INOFF)/$BIN) .. int(($ge+$INOFF)/$BIN) ) { $sigbin{$bin} or next; my @binv= @{ $sigbin{$bin} }; # my @binloc= map{ $sigloc[$_] } @binv; foreach my $binv (@binv) { # $itile next if($didtile{$binv}++); my $tb= $sigloc[$binv]; my $te= $tb + $TILEW; next if($te < $gb-$INOFF || $tb > $ge+$INOFF); # fixme, check -/+ INOFF isnt in near other exon my $inbefore= _max($gb-$INOFF, $le+1); my $inend= _min($ge+$INOFF, $nb-1); my($xover, $inover)=(0,0); $xover= (overlaps( $gb,$ge,$tb,$te)) ? 1 : 0; if(overlaps( $inbefore,$gb-1,$tb,$te)) { $inover=1; } elsif(overlaps( $ge+1,$inend,$tb,$te)) { $inover=1; } for(my $igroup=0; $igroup <= $#signa; $igroup++) { my $gt= $signa[$igroup]; my $tv= $sigv[$binv][$igroup]; if($xover) { score(\@ids,$gt,$tv,"exon"); } if($inover) { score(\@ids,$gt,$tv,"noex"); } } } } } close(G); } if($format =~ /bed/i) { outbed(); } else { outscore(); } #....................................................... sub outscore { # inputs: @gt == @signa, %genes ## try2: want full exon sum of scores : long exons w/ consistent on-score my $k=4; # debug; drop all but gene score [0] ? # my $k=2; # keep exon/noex scores my $kf= join("\t",("%.3g") x $k); my @gtk= map{ ($_) x $k } @gt; print join("\t","gene",@gtk),"\n"; foreach my $id (sort keys %genes) { print "$id"; foreach my $gt (@gt) { my @sc= genescore($id,$gt); printf "\t$kf", @sc[0..$k-1]; } print "\n"; } } sub headbed { my($chr,$treat,$ftype,$info)= @_; if(ref $info) { # my $track="track type=wiggle_0 name=\"$sna\" description=\"signal.gr,id:$sid,bw:$bw,log:$dolog\""; my $track= $info->{track}; $track =~ s/name=\"/name="$ftype-/ if $ftype; return $track; } my $nm= join("_",$treat,$chr,$ftype); return "track type=wiggle_0 name=\"$nm\" description=\"tr:$treat, type:$ftype, chr:$chr\""; } sub _sort_geneloc { my($starta)= split "\t", $genelocs{$a}; my($startb)= split "\t", $genelocs{$b}; return $starta <=> $startb or $a cmp $b; } sub outbed { # @gt == @signa == treatment group names my @gid= sort _sort_geneloc keys %genes; # should sort by location not ID foreach my $exin (0..1) { my $ftype = ($exin == 0) ? "exon" : "intron"; # write bed track header w/ chrom, exin type, treatment. foreach my $isig (0..$#signa) { my $treat= $signa[$isig]; my $info = $siginfo[$isig]; my $head= headbed($chr,$treat,$ftype,$info); print $head,"\n"; foreach my $gid (@gid) { my($start,$stop)= split "\t", $genelocs{$gid}; my @exinscore= genescore($gid,$treat); my $score= sprintf "%.3g", $exinscore[$exin]; # add ID at end; not used by bed processor ?? print join("\t",$chr,$start-1,$stop,$score, $gid),"\n" if $stop; } } } } sub readsigs { # fill/return @signa[0..nfile]= groupname; # @sigv[0..ntile][0..nfile] == sigvalues/group # @sigloc[0..ntile] = loc # %sigbin{0..nbins} = [tileloc1,tloc2,...]; @signa = ("noname") x scalar(@sigr); @siginfo = (undef) x scalar(@sigr); # foreach my $file (@sigr) for (my $igroup=0; $igroup <= $#sigr; $igroup++) { $dolog=0; # global now my $file= $sigr[$igroup]; (my $gname= $file) =~ s/.sig.gr.*$//; #... more parse : get chr# my $info= { source => $gname }; if($file =~ /Dro_Total_AS.+gr.*$/) { $dolog=1; $info= Manak_grfile($file, $span); } # gr_...sig.gr elsif($file =~ /\.gr.*$/) { $dolog=1; $info= affy_grfile($file, $span); } # gr_...sig.gr $gname= $info->{source}; $signa[$igroup]= $gname; $siginfo[$igroup]= $info; my $ok=0; if($file =~ /\.bz2$/) { $ok= open(S,"bzcat $file|"); } elsif($file =~ /\.gz$/) { $ok= open(S,"gunzip -c $file|"); } else { $ok= open(S,$file); } my $itile=0; while(){ chomp; my($loc,$v)=split "\t"; my $bin= int($loc/$BIN); $v= log_value($v) if ($dolog); $sigv[$itile][$igroup]= $v; if($igroup == 0) { $sigloc[$itile]= $loc; push( @{ $sigbin{$bin} }, $itile); # oops; do only for 1 group } $itile++; } close(S); } } # sub readsigs_OLD { # foreach my $file (@sigr) { # my $dolog=0; # (my $gt= $file) =~ s/.sig.gr.*$//; #... more parse : get chr# # if($file =~ /Dro_Total_AS.+gr.*$/) { $dolog=1; $gt= Manak_grfile($file, $span); } # gr_...sig.gr # elsif($file =~ /\.gr.*$/) { $dolog=1; $gt= affy_grfile($file, $span); } # gr_...sig.gr # # $gt{$gt}++; # my $ok=0; # if($file =~ /\.bz2$/) { $ok= open(S,"bzcat $file|"); } # elsif($file =~ /\.gz$/) { $ok= open(S,"gunzip -c $file|"); } # else { $ok= open(S,$file); } # while(){ # chomp; my($loc,$v)=split "\t"; # my $bin= int($loc/$BIN); # $v= log_value($v) if ($dolog); # push( @{$gr{$bin}{$gt}}, "$loc\t$v" ); # #? redo this as (1) fixed size array r-tiles x c-groups, (2) one bin > tile hash # } # close(S); # } # } sub genescore { my($id,$treat)= @_; my @ex= exgroupscore($id,$treat,"exon"); # ($n,$mn,$sd,$sem) my @nx= exgroupscore($id,$treat,"noex"); return ($ex[1], $nx[1], $ex[3], $nx[3]); # test # sem (ex,nx 3) or sd don't appear useful to discriminate good/bad expression # intron score nx might be helpful, not clear; ex[1] is only truly useful score #return ($ex[1] - $nx[1], $ex[3] + $nx[3]); # debug test #return ($ex[1]/($ex[3]||1), $nx[1]/($nx[3]||1)); # debug test # # drop this derived score as too ambiguous; doesnt discriminate good/bad expr # if(@ex and @nx) { # my $md = $ex[1] - $nx[1]; # my $sem= $ex[3] + $nx[3]; #?? # $sem=1 unless($sem); # return (abs($md)>$sem) ? ($md, $sem) : (0,0); # # return ($md/$sem, $md, $sem); #? # } return (0,0,0); } sub exgroupscore { my($id,$treat,$grp)= @_; my($n,$mn,$sd,$sem)= (0)x10; my @v= split " ", $scores{$id}{$treat}{$grp}; # print STDERR "sc: $id.$treat.$grp: @v\n" if $debug; $n= @v; return if($n<1); # this gives 0's for empty tiles; do we need NA instead? # return qw(0 NA NA NA) if($n<0); my($sv,$ss)=(0,0); foreach my $v (@v) { $sv+= $v; $ss+= $v * $v; } $mn= $sv/$n; $sd= ($n<2) ? 0 : sqrt($ss - $sv*$mn/($n-1)); # $sem= $sd/sqrt($n); return($n,$mn,$sd,$sv); # replaced sem w/ sv } sub score { my($ids,$treat,$tv,$grp)= @_; foreach my $id (@$ids) { $scores{$id}{$treat}{$grp} .= "$tv "; } } # convert to array handling? _max(@list) sub _min { return ($_[1] < $_[0]) ? $_[1] : $_[0]; } sub _max { return ($_[1] > $_[0]) ? $_[1] : $_[0]; } sub overlaps { my( $lb,$le,$tb,$te)= @_; my $over= ($tb <= $le && $te >= $lb) ? 1 : 0; if($over) { my ($bb,$be)= ( _max($tb,$lb), _min($te,$le) ); my $maxo= abs($be - $bb); my $leno= _min( abs($le - $lb), abs($te - $tb)) || 1; my $pover= $maxo/$leno; $over = 0 if $pover < $pctover; } return $over; } sub log_value { my $v= shift; my $s=($v<0)? -1:1; $v= abs($v); $v=($v<1)?0:log($v); return $s * $v; } sub affy_grfile { my($fn, $span)= @_; $span ||= 38; my $bn= basename($fn); print STDERR "affy signal span:$span, file: $bn\n" if $debug; unless ($bn =~ m/gr_(\d+)_(\S+)_(\w+)_(\d+)[\._]sig\.gr/) { die "affy_grfile: cant parse data from filename: $bn"; # return {}; } my $sid=$1; my $sna=$2; my $chr=$3; my $bw= $4; my $source= join("_","af",$bw,$sid); #sna ? # return $source; # add below for optional .bed output my $track="track type=wiggle_0 name=\"$sna\" description=\"signal.gr,id:$sid,bw:$bw,log:$dolog\""; my $method= "microarray_oligo"; # default my $trackname= 'track'.($sid-1); return { wformat => "bed", source => $source, method => $method, track => $track, trackname=>$trackname, sid => $sid, chrom => $chr, bandwidth => $bw }; } sub Manak_grfile { my($fn, $span)= @_; $span ||= 35; my $bn= basename($fn); print STDERR "Manak_grfile span:$span, file: $bn\n" if $debug; unless ($bn =~ m/(\w+)_(\w+)_B1\.(\w+)[\._]sig\.gr/) { die "affy_grfile: cant parse data from filename: $bn"; # return {}; } my $sna=$1; my $sid=$2; my $chr=$3; my $bw= 50; ## $4; my $source= join("_","mk",$bw,$sid); # return $source; # track type=wiggle_0 name="myName" description="myDescript" my $track="track type=wiggle_0 name=\"$sna\" description=\"signal.gr,id:$sid,bw:$bw,log:$dolog\""; my $method= "microarray_oligo"; # default my $trackname= 'track'.($sid-1); return { wformat => "bed", source => $source, method => $method, track => $track, trackname => $trackname, sid => $sid, chrom => $chr, bandwidth => $bw }; }