#!/usr/bin/perl # affysig2wig.pl =item affysig2wig.pl #....... test direct wiggle data file creation 1. simple: affy signal gr files (= fixed width, start,score files) set signame=gr_731_Dro2_AS_Kc167_RWP+_chr4_0_sig bzcat sig731/$signame.gr.bz2 | perl -I$men $men/affysig2wig.pl $signame 38 -2000 5000 -- fix up as replacement for wiggle[23]gff3.pl -- also write summary .gff for .wig files produced: chr,source,type,start,stop version w/ options, many sig input files: # affy scores: -tile 38 -min=-2000 -max=5000 set sgdir1=$sc/dmel5/modenc/38bp-arrays/signal-graphs/ set sgdir2=$sc/dmel5/modenc/38bp-aug08/signal-graphs/ # for Manak data, tile=36 min=-50 max=200 # most scores in -10 .. 30 range ; log would be good as w/ affy set sgdir3=$sc/dmel5/modenc/Manak_et_al/sigr/ ** use -type afsig_region instead of plain region to separate displays perl -I$men $men/affysig2wig.pl -debug -type afsigregion -tile=38 -min=-2000 -max=5000 \ -gff=drosmelme/c4affy2.gff -dir drosmelme/c4affy2 \ $sgdir2/gr*chr4_0?sig.gr.bz2 # -log or not ? # -source= -type= -debug ** log of this data does not look better; it weights the small values near to the big ones, so seems to loose resolution. Possibly tile statistics, some sensible transform of the data would work better. 2. less simple: nimblegen overlapped tiles. See this nimgen2gff.pl convert gff_bed to write directly .wig files for gbrowse; set or choose tile-subdiv size (fixed?) even tho nimben tiles range in size (~ 45 .. 65 bp) =cut # use lib('/bio/argos/work/gbrowse/Generic-Genome-Browser-1.69/lib'); #debug use strict; use Getopt::Long; use File::Basename; use File::Spec::Functions qw/ catdir catfile /; use POSIX; use Bio::Graphics::Wiggle; my $dolog= 0; my $sotype="region"; # "microarray_oligo"; #"transcribed_region"; my $source="affysig"; #?? my ($seqid,@sigin,$gffout,$dirout,$tilewidth,$minsig,$maxsig,$stepspan,$debug); my $optok= GetOptions( # "id|name|seqid=s",\$seqid, # drop "output|gff=s",\$gffout, "dirout=s",\$dirout, "input|sigin=s",\@sigin, "tilewidth|span=s",\$tilewidth, "stepspan!", \$stepspan, # special flag for wig, create compressed step==span data "minscore=s",\$minsig, "maxscore=s",\$maxsig, "logscore!",\$dolog, "source=s",\$source, "type=s",\$sotype, # gff summary output "debug!",\$debug, ); die " usage: affysi2wig -tilewidth 38 -minscore -1000 -maxscore 2000 gr*sig.gr.bz2 writes .wig files per input sig.gr with summary track.gff using Bio/Graphics/Wiggle options: -[no]logscore -gff affy-tracks.gff -type $sotype -source $source [replaced by filename info] " unless($optok and $tilewidth and $maxsig); # my $today= POSIX::strftime("%Y%m%d",localtime( $^T)); sub filedate { my $fp= shift; my $ftime= $^T; $ftime -= 24*60*60*(-M $fp) if $fp; # 0 if not exist; default today return POSIX::strftime("%Y%m%d", localtime( $ftime )); } die "Output directory $dirout doesn't exist" if($dirout && ! (-d $dirout)); # or mkdir($dirout)? push(@sigin, @ARGV); # all non options are input files? if($dolog) { $minsig= log_of($minsig); $maxsig= log_of($maxsig); } my (@trackgff); my $gffouth= *STDOUT; if($gffout && $gffout !~ /^(stdout|\-)/) { open(GFFOUT,">$gffout") or die "$gffout"; $gffouth= *GFFOUT; } print $gffouth "##gff-version 3\n"; print $gffouth "# affysig2wig date=",filedate(),"\n"; print $gffouth "# tilewidth=$tilewidth, minscore=$minsig, maxscore=$maxsig, log=$dolog\n"; print $gffouth "\n"; foreach my $sigin (@sigin) { my $gname= $sigin; unless($gname =~ s/.sig.gr.*$//) { } # what? my $info= { source => $gname }; if($sigin =~ /Dro_Total_AS.+gr/) { $info= Manak_grfile($sigin, $tilewidth); } # gr_...sig.gr elsif($sigin =~ /sig\.gr/) { $info= affy_grfile($sigin, $tilewidth); } # gr_...sig.gr ## elsif($sigin =~ /Dro_Total_AS.+bed$/) { $info= Manak_transfrag($sigin); } ## elsif($sigin =~ /bed$/) { $info= affy_transfrag($sigin); } # tf_...bed else { warn "dont know that input format: $sigin\n"; next; } # or die? my $ok=0; local *SIGIN; if($sigin =~ /\.bz2$/) { $ok= open(SIGIN,"bzcat $sigin|"); } elsif($sigin =~ /\.gz$/) { $ok= open(SIGIN,"gunzip -c $sigin|"); } else { $ok= open(SIGIN,$sigin); } unless($ok) { warn "cant open $sigin\n"; next; } # or die my $seqid= $info->{'trackname'} || $gname; my $wigout= "$seqid.wig"; $wigout = catfile( $dirout, $wigout) if $dirout; warn "# write to $wigout\n" if $debug; my $result= affy2wig( *SIGIN, $wigout, $seqid, $tilewidth, $minsig, $maxsig, $dolog); close(SIGIN); my $gffline= join("\t", $info->{'chrom'}||"ref", $info->{'source'}||$source, $info->{'method'}||$sotype, $result->{'start'}||0, $result->{'stop'}||0, ".",".",".", join(";","tid=$seqid","Name=".$info->{'name'}, "wigfile=$wigout","infile=".basename($sigin)) ); push(@trackgff, $gffline); # or write to stdout now ? print $gffouth $gffline,"\n"; } # close($gffouth); # put_summarygff(); # end #................................................................ # do this per input file sub affy2wig { my($sigin, $wigfile, $seqid, $tilewidth, $minsig, $maxsig, $dolog )= @_; # ** use of step instead of span is wrong here; data is not located precisely # but in modulus of step size; obvious at resolution near step/span sizes. # change tilewidth to wiggle.span option; my $writeable=1; my $step= ($stepspan) ? $tilewidth : 1; # testing; need to fix wiggle.pm to use well my $wig = Bio::Graphics::Wiggle->new( $wigfile, $writeable, { seqid => $seqid, step => $step, span => $tilewidth, # was step => $tilewidth, min => $minsig, max => $maxsig, }); my($chrstart,$chrstop) = (undef) x 2; while(<$sigin>) { next unless(/^\d/); chomp; my ($start,$value) = split /\s+/ or next; $start ++; # this is 0-based locations?? # melon.% bzcat $sgdir2/gr_731_Dro2_AS_Kc167_RWP+_chr3RHet_0_sig.gr.bz2 | more # 0 -86.25 << # 43 2534.25 $value= log_of($value) if $dolog; $wig->set_value($start => $value); # store $value at position my $end= $start + $tilewidth; $chrstop= $end if $chrstop < $end; $chrstart= $start unless(defined $chrstart and $chrstart < $start); } # $wig doesnt have a file close method undef $wig; # should close it? return { start => $chrstart, stop => $chrstop }; # what else? } sub log_of { my($v)=shift; return ($v >= 1) ? log($v) : ($v <= -1) ? -log(-$v) : 0; } sub affy_grfile { my($fn, $span)= @_; my $bn= basename($fn); my $date= filedate($fn); print STDERR "# affy signal span:$span, file: $bn\n" if $debug; unless ($bn =~ m/gr_(\d+)_(\S+)_(\w+)_(\d+)[\._]sig\.gr/) { die "affy_grfile: cant parse data from filename: $bn"; # return {}; } my $sid=$1; my $sna=$2; my $chr=$3; my $bw= $4; $chr =~ s/^chr//; # change to std name unless($span) { die "affy_grfile: need option -span N for tile width"; } # $sid= sprintf "%03d", $sid unless($sid =~ /\D/); # dont need here # for tracking stick sid on sna $sna =~ s/[\W]/_/g; $sna= join("_","af".$sid,$sna); # track type=wiggle_0 name="myName" description="myDescript" my $wformat="variableStep chrom=$chr span=$span"; my $track="track type=wiggle_0 name=\"$sna\" description=\"signal.gr,id:$sid,bw:$bw,log:$dolog\""; my $source= join("","afsig",$bw,"g",$sid); my $method= $sotype; # "microarray_oligo"; # default #my $trackname= join("_",$source,$chr,$date); #?? do we need a 'trackNNN' prefix; is gbrowse software looking for this? # gbrowse wiggle loader uses dots; trackNNN.xxx.wig may be significant my $trackname= join(".",'track'.$sid,"afsig",$chr,$date); return { wformat => $wformat, source => $source, method => $method, track => $track, trackname => $trackname, name => $sna, # note => "signal.gr,id:$sid,bw:$bw,log:$dolog", sid => $sid, chrom => $chr, bandwidth => $bw }; } =item Manak_grfile melon.% ls Dro_Total_AS_1_B1 Dro_Total_AS_1_B1.chr2L.sig.gr Dro_Total_AS_1_B1.chr3RHet.sig.gr Dro_Total_AS_1_B1.chr2LHet.sig.gr Dro_Total_AS_1_B1.chr4.sig.gr .. melon.% ls $msg Dro_Total_AS_10_B1.zip Dro_Total_AS_2_B1.zip Dro_Total_AS_6_B1.zip index.html =cut sub Manak_grfile { my($fn, $span)= @_; my $bn= basename($fn); my $date= filedate($fn); print STDERR "# Manak_grfile span:$span, file: $bn\n" if $debug; unless ($bn =~ m/(\w+)_(\w+)_B1\.(\w+)[\._]sig\.gr/) { die "affy_grfile: cant parse data from filename: $bn"; # return {}; } my $sna=$1; my $sid=$2; my $chr=$3; my $bw= 50; ## fixed bandwidth here $chr =~ s/^chr//; # change to std name $sid= sprintf "%03d", $sid unless($sid =~ /\D/); # format for alpha sort 01,02, .. 10,11,12 # for tracking stick sid on sna $sna =~ s/[\W]/_/g; $sna= join("_","mk".$sid,$sna,"B1"); unless($span) { die "Manak_grfile: need option -span N for tile width"; } # track type=wiggle_0 name="myName" description="myDescript" my $wformat="variableStep chrom=$chr span=$span"; my $source= join("","mksig",$bw,"g",$sid); my $method= $sotype; # "microarray_oligo"; # default my $track="track type=wiggle_0 name=\"$sna\" description=\"signal.gr,id:$sid,bw:$bw,log:$dolog\""; # my $trackname= join("_",$source,$chr,$date); #?? do we need a 'trackNNN' prefix; is gbrowse software looking for this? # gbrowse wiggle loader uses dots; trackNNN.xxx.wig may be significant in gbrowse usage my $trackname= join(".",'track'.$sid,"mksig",$chr,$date); return { wformat => $wformat, source => $source, method => $method, track => $track, trackname => $trackname, name => $sna, # note => "signal.gr,id:$sid,bw:$bw,log:$dolog", sid => $sid, chrom => $chr, bandwidth => $bw }; } # sub nimblegen_file { # my($fn, $span)= @_; # # my $bn= basename($fn); # print STDERR "# Nimblegen signal span:$span, file: $bn\n" if $debug; # # unless ($bn =~ m/gr_(\d+)_(\S+)_(\w+)_(\d+)[\._]sig\.gr/) { # # die "affy_grfile: cant parse data from filename: $bn"; # # # return {}; # # } # # my $sid=$1; # # my $sna=$2; # # my $chr=$3; # # my $bw= $4; # # ## get this info from gff.type : data_10533901_532 or file name # # my $sid=$FILE_NUM; #? global var?; this needs to be numeric # my $sna= $bn; # my $chr=""; # many in file gff # my $bw= 0; # # my $wformat="NimbleScan"; ## chrom=$chr span=$span"; # # unless($span) { die "affy_grfile: need option -span N for tile width"; } # # my $track="track type=wiggle_0 name=\"$sna\" description=\"nimblescan,id:$sid,log:$dolog\""; # my $source= join("_","nsc",$sid); # my $method= "microarray_oligo"; # default # my $trackname= 'track'.($sid-1); # # return { wformat => $wformat, source => $source, method => $method, # track => $track, trackname=>$trackname, # sid => $sid, }; # } # __END__