#!/usr/bin/perl # nimgen2gff.pl # process nimblegen NimbleScan tabular (gff-like) input to various outputs use lib('/bio/argos/work/gbrowse/Generic-Genome-Browser-1.69/lib'); #debug use strict; use Getopt::Long; use POSIX; use File::Basename; use File::Spec::Functions qw/ catdir catfile /; use Bio::Graphics::Wiggle; my $FIX_REF_SCAF = 0; # for Daphnia nimgen, not DrosMel my $INTERGENE = 500; #?? what is useful distance to split pseudo-gene chunks? my $NOXSCORE = 900; # arbitrary; should calculate from region? my($format,$bedsource,$maxscore,$meanscore,$fix,$loc,$debug); # $range,$hints,$parts,$dobed,$dowig .. see below my($maxsig, $minsig, $tilewidth,$stepspan); # wig my $dolog=0; # never? my $sotype="region"; # for wig my $source="nimbsig"; # for wig my $optok= GetOptions( "format=s",\$format, # these are all formats: drop for that one option ... "fix!",\$fix, #"maxscore!",\$maxscore, "meanscore!",\$meanscore, #"range!",\$range, "parts!",\$parts, "hints!",\$hints, "intergene=n",\$INTERGENE, "noscore=n",\$NOXSCORE, "REF_SCAF!",\$FIX_REF_SCAF, "tilewidth|span=s",\$tilewidth, # only for wig "stepspan!", \$stepspan, # special flag for wig, create compressed step==span data "minscore=s",\$minsig, "maxscore=s",\$maxsig, # only for wig "bedsource=s",\$bedsource, # alternate to gff after simple nimscangff > bed ## .. do we want these? # "output|gff=s",\$gffout, # "dirout=s",\$dirout, # "input|sigin=s",\@sigin, "location=s",\$loc,"debug!",\$debug, ); $optok=0 if $format =~ /^wig/i && $tilewidth < 2; $optok=0 if $format =~ /^wig/i && !(defined $maxsig && defined $minsig); my $USAGE= "Reformat gff from NimbleGen (NimbleScan) usage1: cat nimgen-original.gff | perl nimgen2gff3.pl -fix > nimgen-fixed.gff usage2: cat nimgen-fixed.gff | sort -k1,1 -k4,4n -k5,5nr | \\ perl nimgen2gff3.pl -format maxscore > nimgen-maxscore.gff usage3: cat nimgen-maxscore.gff | perl nimgen2gff3.pl -format hint > nimgen.hints cat nimgen-maxscore.gff | perl nimgen2gff3.pl -format range > nimgen-range.gff cat nimgen-maxscore.gff | perl nimgen2gff3.pl -format wig > nimgen-tracks.gff * also writes .wig files for each chrom,source formats: fix = correct NimbleScan GFF to GFF v3 standard options: -format fix|range|parts|hints|wig|bed|gff -bedsource HX1_U01_Drosophila.mean0.cgdiff : input format option for wig format -debug | -location scaffold_4:2366000-2391100 fix format: -[no]ref_scaf : fix reference scaffold (Daphnia nimgen gff, not DrosMel) wig format: -tilewidth|span=50 -minscore=500 -maxscore=25000 -stepspan : tile span is wig step size (experimental; compressed wig) NOTE: formats wig|maxscore|meanscore assume input gff is sorted by location "; die $USAGE unless($optok and $format); my($lref,$lbegin,$lend)=split/[:.-]+/,$loc if($loc); ## unbuffer $gffouth $| = 1; # STDOUT unbuffer my $gffouth= *STDOUT; # if($gffout && $gffout !~ /^(stdout|\-)/) { # open(GFFOUT,">$gffout") or die "$gffout"; # $gffouth= *GFFOUT; # } FORMAT: { $format =~ /^fix/i && do{ fix_nimblegen_gff(); last FORMAT }; $format =~ /^maxsc/i && do{ $maxscore=1; gff_max_score(); last FORMAT }; $format =~ /^meansc/i && do{ $meanscore=1; gff_max_score(); last FORMAT }; $format =~ /^range/i && do{ gff_match_range(); last FORMAT }; $format =~ /^part/i && do{ gff_match_parts(); last FORMAT }; $format =~ /^hint/i && do{ gff_range_hints(); last FORMAT }; $format =~ /^wig/i && do{ gff_wig($tilewidth,$maxsig,$minsig,$bedsource); last FORMAT }; $format =~ /^subhint/i && do{ gff_wig($tilewidth,$maxsig,$minsig,$bedsource, 1); last FORMAT }; $format =~ /^bed/i && do{ gff_bed(); last FORMAT }; warn "Dont know format $format"; # die } #------------------------------- sub fix_nimgene_line { local $_ = shift; return $_ unless(/^\w/); # defined GFF columns (all versions); ref is useless "SCAFFOLD" in nimblegen gff # fixme: for Dros NimGen have different gff cols: 1st is DroMel CHR (2L,3R,..no prefix) # then attrib: seq_id=chr2L:1-23011544 ; probe_id=CHR02LFS013110143; chomp; my($refbad,$src,$datatype,$begin,$end,$score,$orient,$phase,$attr)=split "\t"; my($refgood); if($FIX_REF_SCAF) { # ($refgood)= $attr =~ m/(SCAFFOLD_\w+)/; # daphnia nimgen $refgood= ($attr =~ m/seq_id=(\w+)/) ? $1 : $refbad; # more general $refgood= lc($refgood); # does nimblegen uppercase everything? } else { $refgood= $refbad; #otherwise ref col 1 is ok } #.. next if($lend and ($refgood ne $lref or $begin > $lend or $end < $lbegin)); $attr =~ s/seq_id=[^;]+;//; # this is ref + location; hopefully same as chr,chr1-chrEnd $attr =~ s/probe_id=/ID=/; # convert to GFF id $attr .= ";datatype=$datatype"; # need Seq. Ontology "match" type; preserve as attrib. return join("\t",$refgood,$src,"match",$begin,$end,$score,$orient,$phase,$attr),"\n"; } sub fix_nimblegen_gff { print "##gff-version 3\n"; print "# location: $lref:$lbegin-$lend\n" if($lend); while(<>) { unless(/^\w/){ print unless(/^##gff-version/); next; } # print any comments chomp; # defined GFF columns (all versions); ref is useless "SCAFFOLD" in nimblegen gff # fixme: for Dros NimGen have different gff cols: 1st is DroMel CHR (2L,3R,..no prefix) # then attrib: seq_id=chr2L:1-23011544 ; probe_id=CHR02LFS013110143; my($refbad,$src,$datatype,$begin,$end,$score,$orient,$phase,$attr)=split "\t"; my($refgood); if($FIX_REF_SCAF) { # ($refgood)= $attr =~ m/(SCAFFOLD_\w+)/; # daphnia nimgen $refgood= ($attr =~ m/seq_id=(\w+)/) ? $1 : $refbad; # more general $refgood= lc($refgood); # does nimblegen uppercase everything? } else { $refgood= $refbad; #otherwise ref col 1 is ok } next if($lend and ($refgood ne $lref or $begin > $lend or $end < $lbegin)); $attr =~ s/seq_id=[^;]+;//; # this is ref + location; hopefully same as chr,chr1-chrEnd $attr =~ s/probe_id=/ID=/; # convert to GFF id $attr .= ";datatype=$datatype"; # need Seq. Ontology "match" type; preserve as attrib. print join("\t",$refgood,$src,"match",$begin,$end,$score,$orient,$phase,$attr),"\n"; } } # my $today= POSIX::strftime("%Y%m%d",localtime( $^T)); sub filedate { my $fp= shift; my $ftime= $^T; $ftime -= 24*60*60*(-M $fp) if $fp; # 0 if not exist; default today return POSIX::strftime("%Y%m%d", localtime( $ftime )); } =item gff_wig this reads *fixed* and color-combined Nimblegen gff, sorted by location, and writes wiggle data files using Bio/Graphics/Wiggle Required options now -tilewidth : assumes, uses fixed tile width even though there is some slop in practice -minscore : minimum tile score -maxscore : max tile score set dname=me8nsc731 set ndir=$sc/dmel5/modenc/nimblegen/ melon.% ls $ndir/2*max.gff.gz dmel5/modenc/nimblegen/246498C03_HX1_U01_Drosophila_max.gff.gz dmel5/modenc/nimblegen/246526C03_HX1_U01_Drosophila_max.gff.gz dmel5/modenc/nimblegen/246533C02_HX1_U01_Drosophila_max.gff.gz dmel5/modenc/nimblegen/246535C02_HX1_U01_Drosophila_max.gff.gz dmel5/modenc/nimblegen/246570C01_HX1_U01_Drosophila_max.gff.gz dmel5/modenc/nimblegen/246629C03_HX1_U01_Drosophila_max.gff.gz dmel5/modenc/nimblegen/270982C01_HX1_U01_Drosophila_max.gff.gz mkdir drosmelme/$dname cd drosmelme/$dname gunzip -c $ndir/2*max.gff.gz |\ perl -I$men $men/nimgen2gff.pl -debug -format wig -tilewidth=50 -minscore=500 -maxscore=25000 \ > $dname-tracks.gff =item alternate .bed input from foreach nn ( $nimgf ) if ( -e $nn.bed ) continue gunzip -c $nimd/$nn.gff.gz | perl -ne \ 'next unless(/^\w/);($r,$s,$t,$b,$e,$v)=split; print join("\t",$r,$b,$e,$v),"\n";' > $nn.bed echo done $nn.bed end =item drosmelme data files cDNA, gDNA in pair1, pair2; reverse in repl 3 cDNA gDNA #replicate 1 246569C01_HX1_U02_Drosophila_532 246569C01_HX1_U02_Drosophila_635 246533C02_HX1_U01_Drosophila_532 246533C02_HX1_U01_Drosophila_635 246498C03_HX1_U01_Drosophila_532 246498C03_HX1_U01_Drosophila_635 #replicate 2 270982C01_HX1_U01_Drosophila_532 270982C01_HX1_U01_Drosophila_635 246535C02_HX1_U01_Drosophila_532 246535C02_HX1_U01_Drosophila_635 246526C03_HX1_U01_Drosophila_532 246526C03_HX1_U01_Drosophila_635 #replicate 3 246570C01_HX1_U01_Drosophila_635 246570C01_HX1_U01_Drosophila_532 246534C02_HX1_U02_Drosophila_635 246534C02_HX1_U02_Drosophila_532 246629C03_HX1_U01_Drosophila_635 246629C03_HX1_U01_Drosophila_532 perl compare #................... See ../dspp2work/dmeltilex/drosmelme-nimcombo.pl #!/usr/bin/perl # drosmelme-nimcombo.pl # this is vv---gDNA---vv vv------cDNA----vv @pairfiles= qw( 246569C01_HX1_U02_Drosophila_532 246569C01_HX1_U02_Drosophila_635 246533C02_HX1_U01_Drosophila_532 246533C02_HX1_U01_Drosophila_635 246498C03_HX1_U01_Drosophila_532 246498C03_HX1_U01_Drosophila_635 270982C01_HX1_U01_Drosophila_532 270982C01_HX1_U01_Drosophila_635 246535C02_HX1_U01_Drosophila_532 246535C02_HX1_U01_Drosophila_635 246526C03_HX1_U01_Drosophila_532 246526C03_HX1_U01_Drosophila_635 246570C01_HX1_U01_Drosophila_635 246570C01_HX1_U01_Drosophila_532 246534C02_HX1_U02_Drosophila_635 246534C02_HX1_U02_Drosophila_532 246629C03_HX1_U01_Drosophila_635 246629C03_HX1_U01_Drosophila_532 ); @pairs= @pairfiles; for($irep=0; $irep<3; $irep++) { for($jpart=0; $jpart<3; $jpart++) { ($gdna,$cdna)= splice(@pairs,0,2); $group[$jpart][$irep]= [$gdna,$cdna]; } } print "#rep gdna cdna\n"; for($jpart=0; $jpart<3; $jpart++) { print "# part: $jpart, reps 1..3\n"; for($irep=0; $irep<3; $irep++) { ($gdna,$cdna)= @{ $group[$jpart][$irep] }; showpair($jpart, $irep, $cdna,$gdna); } } #.... sub showpair { my($part, $rep, $af, $bf, $nshow)= @_; $nshow ||= 10; my $ishow= 0; my $istep=1000; $nshow = $nshow * $istep; print "# part:$part, rep:$rep, pair: $af $bf\n"; open(DA,"gunzip -c $af.bed.gz|"); open(DB,"gunzip -c $bf.bed.gz|"); print join("\t",qw(ref start end Aval Bval Diff)),"\n"; while() { $ishow++; last if($ishow > $nshow); my $ra=$_; my $rb= ; chomp($ra); chomp($rb); my @va=split "\t", $ra; my @vb=split "\t", $rb; $va[3]= int($va[3]); $vb[3]= int($vb[3]); my $err= 0; for $k (0..2) { $err=1 if($va[$k] ne $vb[$k]); } my $diff= $va[3] - $vb[3]; if($err) { print "ERR $af: < $ra\n > $rb \n"; } else { print join("\t",@va,$vb[3], $diff),"\n" if ($ishow % $istep == 0); } } print "\n\n"; close(DA); close(DB); } =cut sub gff_wig { my($tilewidth,$maxsig,$minsig, $bedsource, $ashints)= @_; # note: @ARGV may be file names or may have input on STDIN # my $sigin = *STDIN; #? my $result; #if $gffouth ?? print $gffouth "##gff-version 3\n"; print $gffouth "# nimgen2wig date=",filedate(),"\n"; print $gffouth "# tilewidth=$tilewidth, minscore=$minsig, maxscore=$maxsig, log=$dolog\n"; print $gffouth "\n"; if(@ARGV) { ## add this and @ARGV files foreach my $sigin (@ARGV) { warn "# gff_wig read $sigin\n" if $debug; my $ok=0; local *SIGIN; if($sigin =~ /\.bz2$/) { $ok= open(SIGIN,"bzcat $sigin|"); } elsif($sigin =~ /\.gz$/) { $ok= open(SIGIN,"gunzip -c $sigin|"); } else { $ok= open(SIGIN,$sigin); } unless($ok) { warn "cant open $sigin\n"; next; } # or die my $fileid= fileparse($sigin); # doesnt drop suffix :( $fileid =~ s/\.(gz|bz2)$//; $fileid =~ s/\.(bed|gff)$//; my $thisbed= ($bedsource) ? $fileid : undef; print $gffouth "\n# source=$fileid\n"; if ($ashints) { nimgen2hints( *SIGIN, $tilewidth, $thisbed ); } else { $result .= nimgen2wig( *SIGIN, $tilewidth, $thisbed ); } close(SIGIN); } } else { warn "#gff_wig read STDIN\n" if $debug; if ($ashints) { nimgen2hints( *STDIN, $tilewidth, $bedsource ); } else { $result .= nimgen2wig( *STDIN, $tilewidth, $bedsource ); } } #see below# print $gffouth join("\n", @$result), "\n"; } =item subdivide_tiles_forwig * change strategy here from above for .bed -- ask and return fixed number/width of parts / tile -- for part areas of overlap, calc ave score weighted by distance from center of (two) tiles -- for part areas w/ no overlap, just current p tile score =cut sub subdivide_tiles_forwig { my($p, $p0, $p1, $partwidth)= @_; # points in .bed array [ref,b,e,score] return [] unless(ref $p); my($pa,$pb,$pc)= (undef) x 3; my($ref,$rb,$re,$rv,$rsource)= @$p; my $rc= int(($rb+$re)/2); my $rw= int(abs($re - $rb)/2); my $np= int( $rw / $partwidth ); # dont need p->ref here my @parts; for(my $pb= $rb; $pb < $re; $pb += $partwidth) { my $pe= $pb + $partwidth; # dont need for wig $pe= $re if $pe > $re; my $pc= int(($pb+$pe)/2); # center of current subtile my $distr= abs($rc - $pc); # distance from center of full tile my $pv= $rv; # subtile score, what we change my $over= 0; if(ref $p0 and $over < 1) { $over= ($p0->[0] eq $ref) ? $p0->[2] - $pb : 0; $over= $partwidth if $over > $partwidth; if($over > 2) { my $v0= $p0->[3]; # overlap score my $c0= int(( $p0->[1] + $p0->[2])/2); my $dist0= abs($c0 - $pc); # distance to overlap tile center my $weight0= 1 - ($dist0 / ($dist0 + $distr)); # 1 if near c0, overlap center my $weightr= 1 - $weight0; # ($distr / ($dist0 + $distr)); # 1 if near rc, this center my $pover = $over / $partwidth; my $newv= (1 - $pover) * $rv # portion not over + ($pover) * ( $v0 * $weight0 + $rv * $weightr ); $pv= int($newv); # int option? } } if(ref $p1 and $over < 1) { $over= ($p1->[0] eq $ref) ? $pe - $p1->[1] : 0; $over= $partwidth if $over > $partwidth; if($over > 2) { my $v0= $p1->[3]; my $c0= int(( $p1->[1] + $p1->[2])/2); my $dist0= abs($c0 - $pc); my $weight0= 1 - ($dist0 / ($dist0 + $distr)); my $weightr= 1 - $weight0; # ($distr / ($dist0 + $distr)); my $pover = $over / $partwidth; my $newv= (1 - $pover) * $rv # portion not over + ($pover) * ( $v0 * $weight0 + $rv * $weightr ); $pv= int($newv); # option? } } my $pa= [$ref, $pb, $pe, $pv, $rsource]; push @parts, $pa; } return \@parts; } sub trackhead_forwig { my( $chr, $src, $typ, $tilewidth)= @_; my $seqid= $src; #?? $seqid =~ s/^[a-z]+\.(\d+)/$1/; # drop silly prefix my $name= join("_","nimgen",$seqid); # vis label; not same as trackname my $date= filedate(); # fixme my $trackname= join(".",'track'.$seqid,"nimgen",$chr,$date); my $wigfile= "$trackname.wig"; # fixme warn "# write to $wigfile\n" if $debug; my $writeable=1; my $step= ($stepspan) ? $tilewidth : 1; # testing; need to fix wiggle.pm to use well my $wig = Bio::Graphics::Wiggle->new( $wigfile, $writeable, { seqid => $trackname, # step => $tilewidth, ## ** misuse of step for variable spaced wig data step => $step, span => $tilewidth, min => $minsig, max => $maxsig, }); return ($wig, $wigfile, $name); } sub trackgff { my( $wigout, $chr, $start, $stop, $source, $sotype, $name, $infile) = @_; my @attr= (); # push @attr, "tid=$seqid" if $seqid; push @attr, "Name=$name" if $name; push @attr, "wigfile=$wigout"; # must have # push @attr, "infile=".basename($infile) if $infile; my $gffline= join("\t", $chr, $source, $sotype, $start, $stop, ".",".",".", join(";",@attr) ); print $gffouth $gffline,"\n" if $gffouth; return $gffline; } # do this per input file sub nimgen2wig { my( $sigin, $tilewidth, $bedsource )= @_; # alternate .bed input needs file source string my $frombed= ($bedsource =~ /\w/) ? 1: 0; my $subtilewidth= int((4+$tilewidth) / 5); # should this be user option? ; now = 10 for 50b nimgen tiles my ($wig, $wigfile, $visname); my($p0, $p, $p1)= (undef) x 3; my($lref, $lsrc, $nvals); my @trackgff; # or write to stdout now ? my($chrstart,$chrstop) = (undef) x 2; while(<$sigin>) { next unless(/^\w/); chomp; my($ref,$rsrc,$typ,$rb,$re,$score); if($frombed) { ($ref,$rb,$re,$score) = split"\t"; $rsrc= $bedsource; } else { ($ref,$rsrc,$typ,$rb,$re,$score)= split "\t"; } $p0= $p; $p = $p1; $p1= [$ref,$rb,$re,$score, $rsrc]; my $locs= subdivide_tiles_forwig( $p, $p0, $p1, $subtilewidth); foreach my $loc (@$locs) { my($cr,$cb,$ce,$cv,$csrc)= @$loc; $cv= int($cv); # dont need decimals for scores in 500 .. 65000 range; option? unless(defined $lref && $cr eq $lref && $csrc eq $lsrc) { push( @trackgff, trackgff( $wigfile, $lref, $chrstart, $chrstop, $lsrc, $sotype, $visname) ) if $lref; ($chrstart,$chrstop) = (undef) x 2; ## need to create new wig file here *** ($wig, $wigfile, $visname)= trackhead_forwig( $cr,$csrc,$typ, $subtilewidth); ($lref,$lsrc)= ($cr,$csrc); } die "bad wig object at $cr,$csrc,$typ,$cb " unless $wig; $wig->set_value( $cb => $cv); $nvals++; $chrstop = $ce unless(defined $chrstop and $chrstop > $ce); $chrstart= $cb unless(defined $chrstart and $chrstart < $cb); } } undef $wig; # should close it? warn "# tilepart=$subtilewidth, nvals=$nvals\n" if $debug; push( @trackgff, trackgff( $wigfile, $lref, $chrstart, $chrstop, $lsrc, $sotype, $visname) ) if $lref; return \@trackgff; } # merge of hints + nimgen tile averagin sub nimgen2hints { my( $sigin, $tilewidth, $bedsource )= @_; # alternate .bed input needs file source string my $frombed= ($bedsource =~ /\w/) ? 1: 0; my $subtilewidth= int((4+$tilewidth) / 5); # should this be user option? ; now = 10 for 50b nimgen tiles my $hinttype="ep"; #?? exonpart my ($wig, $wigfile, $visname); my($p0, $p, $p1)= (undef) x 3; my($lref, $lsrc, $nvals); my @trackgff; # or write to stdout now ? my($chrstart,$chrstop) = (undef) x 2; my $ipar=0; while(<$sigin>) { next unless(/^\w/); chomp; my($ref,$rsrc,$typ,$rb,$re,$score); if($frombed) { ($ref,$rb,$re,$score) = split"\t"; $rsrc= $bedsource; } else { ($ref,$rsrc,$typ,$rb,$re,$score)= split "\t"; } $ipar++; #? $p0= $p; $p = $p1; $p1= [$ref,$rb,$re,$score, $rsrc]; my $locs= subdivide_tiles_forwig( $p, $p0, $p1, $subtilewidth); foreach my $loc (@$locs) { my($cr,$cb,$ce,$cv,$csrc)= @$loc; $cv= int($cv); # dont need decimals for scores in 500 .. 65000 range; option? my $parid="${csrc}_R${ipar}"; print $gffouth join("\t",$cr,$csrc,$hinttype,$cb,$ce,$cv,".",".","grp=$parid;src=N;pri=9"),"\n"; $chrstop = $ce unless(defined $chrstop and $chrstop > $ce); $chrstart= $cb unless(defined $chrstart and $chrstart < $cb); } } } =item nimgen2wig data bug : FIXED (test) # some kind of problem here: got data put into wrong chr,src files >> there is no X chrom in this source file: dmel5/modenc/nimblegen/270982C01_HX1_U01_Drosophila_max.gff.gz melon.% gunzip -c $ndir/270982C01*max.gff.gz | perl -ne'($r,$s,$t,$b,$e)=split; print "$r\n";' | sort | uniq -c 1167325 2L 895060 2R ? any chance another _max.gff has wrong data (source = 270982C01)? ? could it come from mismatch of (ref,rb,re,src) and @locs[cr,cb,ce]? >> but got this melon.% ll drosmelme/$dname/*270982C01* 2300714 Sep 1 15:02 drosmelme/me8nsc731/track270982C01.nimgen.2L.20080901.wig 1791177 Sep 1 15:06 drosmelme/me8nsc731/track270982C01.nimgen.2R.20080901.wig 2242487 Sep 1 14:58 drosmelme/me8nsc731/track270982C01.nimgen.X.20080901.wig << >> gff X nsc270982C01 nscregion 5146 5195 ... Name=NimbleScan.270982C01;wigfile=../databases/drosmelme/me8nsc731/track270982C01.nimgen.X.20080901.wig melon.% ll -t drosmelme/$dname/*.wig 1791177 Sep 1 15:06 drosmelme/me8nsc731/track270982C01.nimgen.2R.20080901.wig 2300714 Sep 1 15:02 drosmelme/me8nsc731/track270982C01.nimgen.2L.20080901.wig 2242487 Sep 1 14:58 drosmelme/me8nsc731/track270982C01.nimgen.X.20080901.wig << bogus ^^^ some kind of slop over from last data wig file 2242484 Sep 1 14:58 drosmelme/me8nsc731/track246629C03.nimgen.X.20080901.wig ^^^ last correct wig output >> other slop over cases? where ever new file/source-type is openned? >> because of this lag : p = p1; p1 = new ref,rb,re >> then source changes one step before chr changes ; >> last chr tile mistakenly written to new bogus wig file >> then new chr-src wig file is openned 1791179 Sep 1 14:50 drosmelme/me8nsc731/track246629C03.nimgen.2R.20080901.wig << bogus 1791177 Sep 1 14:50 drosmelme/me8nsc731/track246570C01.nimgen.2R.20080901.wig 1247455 Sep 1 14:43 drosmelme/me8nsc731/track246570C01.nimgen.3R.20080901.wig << bogus 1247455 Sep 1 14:43 drosmelme/me8nsc731/track246535C02.nimgen.3R.20080901.wig 2242487 Sep 1 14:28 drosmelme/me8nsc731/track246533C02.nimgen.X.20080901.wig << bogus 2242487 Sep 1 14:28 drosmelme/me8nsc731/track246526C03.nimgen.X.20080901.wig 2790132 Sep 1 14:24 drosmelme/me8nsc731/track246526C03.nimgen.3R.20080901.wig < ok 2242484 Sep 1 14:21 drosmelme/me8nsc731/track246498C03.nimgen.X.20080901.wig >> from $dname-tracks.gff .. in SLOP cases, .wig file replaced w/ good data #BAD.SLOP#X nsc246526C03 nscregion 12471963 12472007 . . . Name=NimbleScan.246526C03;wigfile=../databases/drosmelme/me8nsc731/track246526C03.nimgen.X.20080901.wig #BAD#X nsc246533C02 nscregion 17909231 17909278 . . . Name=NimbleScan.246533C02;wigfile=../databases/drosmelme/me8nsc731/track246533C02.nimgen.X.20080901.wig #BAD.SLOP#3R nsc246535C02 nscregion 17909231 17909278 . . . Name=NimbleScan.246535C02;wigfile=../databases/drosmelme/me8nsc731/track246535C02.nimgen.3R.20080901.wig #BAD#3R nsc246570C01 nscregion 5146 5195 . . . Name=NimbleScan.246570C01;wigfile=../databases/drosmelme/me8nsc731/track246570C01.nimgen.3R.20080901.wig #BAD# 2R nsc246629C03 nscregion 12471963 12472007 . . . Name=NimbleScan.246629C03;wigfile=../databases/drosmelme/me8nsc731/track246629C03.nimgen.2R.20080901.wig #BAD# X nsc270982C01 nscregion 5146 5195 . . . Name=NimbleScan.270982C01;wigfile=../databases/drosmelme/me8nsc731/track270982C01.nimgen.X.20080901.wig =cut =item gff_bed ** FIXME *** FIXME: .bed and wiggle dont understand overlapped tiles and they wipe out 1/2 data due to that (at least gbrowse wiggle) Need here to calculate subdivided tiles from overlapping tiles and write out that for .bed uses *** =item subdivide_tiles See in .. ggb169/lib/Bio/Graphics/Glyph/tilexyplot.pm sub subdivide_tiles subclass _draw_boxes to handle overlapped tile array features. want to average and subdivide overlaps to give finer grain expression peaks. e.g. gff: chr1 nsc region 100 150 500 ... chr1 nsc region 125 175 900 ... chr1 nsc region 150 200 300 ... >> split overlaps and give ave. score; might do better with running/smoothed aver. >> i.e. should show peak nearer 900 in 140..160 region, with drop off either side chr1 nsc region 100 124 500 ... chr1 nsc region 125 149 700 ... chr1 nsc region 150 174 600 ... chr1 nsc region 175 200 300 ... version 2 for gff input stream: - take 3 position window, p0, p, p1, as above (special case edges) - p => pa [.33 width] at p -1/3w, pb [.33 width] at p, pc [.33w] at p+1/3w ** subdivide into 3 parts? with score at center ~ original score chr1 nsc region 100 150 500 ... > 113..137, ... , 125 +/-4, ... chr1 nsc region 125 175 900 ... > 138..162, 142 +/-4, 150 +/-4, 158 +/-4 chr1 nsc region 150 200 300 ... > 163..187, ..., 175 +/-4, ... score(pa) = s(p) * 0.66 + s(p0) * 0.33 score(pb) = s(p) score(pc) = s(p) * 0.66 + s(p1) * 0.33 =cut sub subdivide_tiles { my($p, $p0, $p1)= @_; # points in .bed array [ref,b,e,score] return [] unless(ref $p); my($pa,$pb,$pc)= (undef) x 3; my($ref,$rb,$re,$rv)= @$p; $pb= [$ref, $rb, $re, $rv]; # default same as input $p my $rc= int(($rb+$re)/2); if(ref $p0) { my $over= ($p0->[0] eq $p->[0]) ? $p0->[2] - $p->[1] : 0; if($over > 2) { my $wd= int($over/2); # split overlap b/n last and this loc my $rbnew= $rb + $wd; my $rmid= int(($rc + $rbnew)/2); my $smid= $rv * 0.66 + $p0->[3] * 0.33; $pa= [$ref, $rbnew, $rmid, $smid]; $pb->[1]= $rmid; } } if(ref $p1) { my $over= ($p1->[0] eq $p->[0]) ? $p->[2] - $p1->[1] : 0; if($over > 2) { my $wd= int($over/2); # split overlap b/n last and this loc my $renew= $re - $wd; my $rmid= int(($rc + $renew)/2); my $smid= $rv * 0.66 + $p1->[3] * 0.33; $pc= [$ref, $rmid, $renew, $smid]; $pb->[2]= $rmid; } } return [$pa,$pb,$pc] if($pa and $pc); return [$pa,$pb] if ($pa); return [$pb,$pc] if ($pc); return [$pb]; } sub trackhead { my($ref,$src,$typ)= @_; my $nm= join("_",$src,$ref); my $trackd= "track type=wiggle_0 name=\"$nm\"" . " description=\"Nimblegen overlapped tile split source:$src, type:$typ, chr:$ref\""; print $trackd,"\n"; } sub gff_bed { ## this assumes input gff is location-sorted my $sotype="transcribed_region"; #?? expressed_sequence_match my $pmin= $NOXSCORE; my($p0, $p, $p1)= (undef) x 3; my($lr, $ls); while(<>) { next unless(/^\w/); my @v= split; my($ref,$src,$typ,$rb,$re,$score)= @v; # # add track lines somewhere, from $src,$typ ?? # if($ref ne $lr or $src ne $ls) { trackhead($ref,$src,$typ); } # print join("\t",$ref,$rb-1,$re,$score),"\n"; # note 0-based * $p0= $p; $p = $p1; $p1= [$ref,$rb,$re,$score]; my $locs= subdivide_tiles( $p, $p0, $p1); foreach my $loc (@$locs) { my($cr,$cb,$ce,$cv)= @$loc; $cv= int($cv); # dont need decimals for scores in 500 .. 65000 range; option? if($cr ne $lr or $src ne $ls) { trackhead($cr,$src,$typ); ($lr,$ls)= ($cr,$src); } print join("\t",$cr,$cb-1,$ce,$cv),"\n"; # note 0-based * } # ($lr,$ls)= ($ref,$src); } $p0= $p; $p = $p1; $p1= undef; my $locs= subdivide_tiles( $p, $p0, $p1); foreach my $loc (@$locs) { my($cr,$cb,$ce,$cv)= @$loc; $cv= int($cv); # dont need decimals for scores in 500 .. 65000 range; option? print join("\t",$cr,$cb-1,$ce,$cv),"\n"; # note 0-based * } } sub gff_max_score { my(@lv,$lr,$lb,$le,$pmax,$psum,$np,@p); while(<>) { print and next unless(/^\w/); my @v= split; my($r,$s,$t,$b,$e,$p)= @v; # same as above $ref,.. if($r eq $lr and $b eq $lb and $e eq $le) { # same location $np++; push(@p,$p); $psum += $p; $pmax= $p if($pmax<$p); } else { # new location if(@lv) { $lv[-1].=";np=$np;pa=".join(",",@p) if($debug); $lv[5]= ($meanscore) ? $psum/$np : $pmax; print join("\t",@lv),"\n"; } $psum= $pmax= $p; $np=1; @p=($p); } @lv=@v; ($lr,$lb,$le)=($r,$b,$e); # save last location } if(@lv) { $lv[-1].=";np=$np;pa=".join(",",@p) if $debug; $lv[5]= ($meanscore) ? $psum/$np : $pmax; print join("\t",@lv),"\n"; } } sub gff_match_range { my(@lv,$rb,$lr,$lb,$ls,$le,$psum,$pn,$par); my $sotype="transcribed_region"; #?? expressed_sequence_match my $sopart="HSP"; #?? expressed_sequence_match my $source="NSCx"; $par=1; $le=$rb=0; my $pmin= $NOXSCORE; while(<>) { print and next unless(/^\w/); my @v= split; my($r,$s,$t,$b,$e,$p)= @v; # same as above $ref,.. # next if($p <= $NOXSCORE); # == no score/off # range output should include main/part types: region (sep by > INTERGENE) and HSP (sep by < INTER) ## need also end last range when p < noxscore if(($r ne $lr or $b - $le > $INTERGENE)) { # $p > $NOXSCORE and my $size=1+$le-$rb; if($r eq $lr and $le>0 and $size > 100 and $psum > 2 * $pmin) { $pn||=1; my $pa= int($psum/$pn); my $intr=($b-$le-1); my $size=1+$le-$rb; my $parid="${source}_R${par}"; # urk must use Last par id; not next; print HSPs before region print join("\t",$lr,$source,$sotype,$rb,$le,$pa,".",".","ID=$parid;size=$size;intr=$intr"),"\n"; } $par++; $rb= $b; $pn= $psum=0; $pmin= $NOXSCORE; ($lr,$lb,$le,$ls)=($r,$b,$e,$s); # save last location } if($p > $pmin) { $psum += $p; $pn++; $pmin= $p if($p> $NOXSCORE and $p < (0.3 * $psum/$pn)); # try to find better expr/noexpr bound ($lr,$lb,$le,$ls)=($r,$b,$e,$s); # save last location } #($lr,$lb,$le,$ls)=($r,$b,$e,$s); # save last location } # end: if($rb < $le) { $pn||=1; my $pa= int($psum/$pn); my $intr=($le-$rb-1); my $size=1+$le-$rb; my $parid="${source}_R${par}"; # urk must use Last par id; not next; print HSPs before region print join("\t",$lr,$source,$sotype,$rb,$le,$pa,".",".","ID=$parid;size=$size;intr=$intr"),"\n"; } } # for augustus hints, with grp= based on intergene separation sub gff_range_hints { my(@lv,$rb,$lr,$lb,$le,$psum,$pn,$par); my $hinttype="ep"; #?? exonpart #my $source="NSCx"; $par=1; $le=$rb=0; my $pmin= $NOXSCORE; my($r,$source,$t,$b,$e,$p); while(<>) { print and next unless(/^\w/); my @v= split; ($r,$source,$t,$b,$e,$p)= @v; # same as above $ref,.. next if($p < $NOXSCORE); #? skip low score or mark these? if(1) { # ($r ne $lr or $b - $le > $INTERGENE) my $size= 1+$le-$rb; #if($r eq $lr and $le>0 and $p > $pmin) { # and $size > 100 #$pn||=1; my $pa= int($psum/$pn); #my $intr=($b-$le-1); my $size=1+$le-$rb; #my $parid="${source}_R${par}"; #print join("\t",$r,$source,$hinttype,$b,$e,$p,".",".","grp=$parid;src=N;pri=9"),"\n"; #} if($r ne $lr or $b - $le > $INTERGENE) { # new parent/possible gene bound # my $intr=($b-$le-1); my $size=1+$le-$rb; $par++; $rb= $b; $pn= $psum=0; $pmin= $NOXSCORE; ($lr,$lb,$le)=($r,$b,$e); # save last location } } if($p > $pmin) { $psum += $p; $pn++; $pmin= $p if($p> $NOXSCORE and $p < (0.3 * $psum/$pn)); # try to find better expr/noexpr bound ($lr,$lb,$le)=($r,$b,$e); # save last location } if($p>$NOXSCORE) { my $parid="${source}_R${par}"; print join("\t",$r,$source,$hinttype,$b,$e,$p,".",".","grp=$parid;src=N;pri=9"),"\n"; } } } sub gff_match_parts { ## revise this to average and split overlapped score parts my(@lv,$rb,$lr,$lb,$ls,$le,$psum,$pn,$par); my $sotype="HSP"; #?? expressed_sequence_match my $source="NSCx"; $par=1; while(<>) { print and next unless(/^\w/); my @v= split; my($r,$s,$t,$b,$e,$p)= @v; # same as above $ref,.. #?? dont skip these, but dont change parent # next if($p < $NOXSCORE); # == no score/off if($p > $NOXSCORE and ($r ne $lr or $b - $le > $INTERGENE)) { $par++; $rb= $b; $pn= $psum=0; } ($lr,$lb,$le,$ls)=($r,$b,$e,$s); # save last location print join("\t",$r,$source,$sotype,$b,$e,$p,".",".","Parent=${source}_R${par}"),"\n"; } } __END__ DrosMel nimgen (in two color sets): ngen=/c7/eugenes/daphnia/genomes/Daphnia_pulex/tilepatharrays/ * Dear Don, - you mixed up cDNA and gDNA samples by taking max of the color samples {532,635} - want each of 3 bio replicates x cDNA,gDNA per chromosome as viewable ? > prefer just 3 views , but keep all separate samples as option... (a) best answer from cDNA - gDNA average/combine of replcates (b) best combo of just cDNA, and (c) of just gDNA ... ............................. cat 246498C03_HX1_U01_Drosophila_{532,635}.gff | $ngen/nimgen2gff.pl -fix | \ sort -k1,1 -k4,4n -k5,5nr | $ngen/nimgen2gff.pl -maxscore \ > ../246498C03_HX1_U01_Drosophila_max.gff perl -pi -e's/;count=1;datatype=data_.+$//; s/NimbleScan/nsc.246498C03/;'\ ../246498C03_HX1_U01_Drosophila_max.gff #!/bin/csh # fixdmelnimg.sh set nidset=(246498C03 246526C03 246533C02 246535C02 246570C01 246629C03 270982C01) set ngen=/c7/eugenes/daphnia/genomes/Daphnia_pulex/tilepatharrays/ foreach nid ($nidset) ls -l /c7/eugenes/daphnia/jeochoi/${nid}_HX1_U01_Drosophila_{532,635}.gff cat /c7/eugenes/daphnia/jeochoi/${nid}_HX1_U01_Drosophila_{532,635}.gff|\ $ngen/nimgen2gff.pl -fix | \ sort -T /c7/tmp -k1,1 -k4,4n -k5,5nr | $ngen/nimgen2gff.pl -maxscore \ > ${nid}_HX1_U01_Drosophila_max.gff end