#!/usr/bin/perl -w # # Program to extract large, well-supported recombination events from a # ClonalFrame summary file. # # Usage: [Min length = 500] [Min support = 0.5] # use strict; die "Usage [Min length = 500] [Min support = 0.5]\n" if( @ARGV < 2 || @ARGV > 4); my @seqnames; open( XMFAINPUT, $ARGV[1] ) || die "Unable to open XMFA file $ARGV[1]\n"; while( my $line = ) { last unless $line =~ /^\#/; if( $line =~ /^\#Sequence\d+File\t(.+)/ ) { my @bb = split( /[\\\/]/, $1 ); push( @seqnames, $bb[$#bb] ); } } print "Sequence names are @seqnames\n"; open( CFINPUT, $ARGV[0] ) || die "Unable to open ClonalFrame file $ARGV[0]\n"; my @cfdata = ; my @odat; # get rid of empty lines for( my $cI = 0; $cI < @cfdata; $cI++ ) { next unless length($cfdata[$cI]) > 1; push( @odat, $cfdata[$cI] ); } @cfdata = @odat; my $outfile = $ARGV[2]; $outfile .= ".mutations" unless( $outfile =~/\.mutations$/ ); print "Writing output to $outfile\n"; open( XMLOUT, ">$outfile" ) || die "Unable to open XML output file $outfile\n"; my $event_len = 500; $event_len = $ARGV[3] if( @ARGV > 3 ); my $event_sup = 0.5; $event_sup = $ARGV[4] if( @ARGV > 4 ); # read the consensus tree my $refline = 0; while( !($cfdata[$refline++]=~/^\#constree/) ){ } my $tree = $cfdata[$refline]; # read the sequence names part $refline = 0; my @names; while( !($cfdata[$refline++]=~/^\#names/) ){ } while( !($cfdata[$refline++]=~/^\#/) ){ chomp( $cfdata[$refline-1] ); push(@names, $cfdata[$refline-1] ); } print "CF names are: @names\n"; # read the reference sites part $refline = 0; my @refsites; while( !($cfdata[$refline++]=~/^\#poly/) ){ } my $prev = 1; while( !($cfdata[$refline++]=~/^\#/) ){ my $ref = $cfdata[$refline-1]; chop($ref); next unless $ref =~ /\d/; $ref = $prev if $ref == 0; $prev = $ref; push( @refsites, $ref); } my $chr_length = $refsites[$#refsites]; print "Using reference sequence length of $refsites[$#refsites]\n"; # perform tree transformations to get XML format # first strip out branch lengths # then reverse the tree so numbers are first # then replace parens with speciation tags $tree =~ s/:\d\.\d+//g; $tree = reverse($tree); $tree =~ s/(\d+)/\n\n/g; $tree =~ s/[,\(]/<\/speciation>\n/g; $tree =~ s/\)//g; my @lineage = split( /\n/, $tree ); for( my $lineI = 0; $lineI < @lineage; $lineI++ ) { $lineage[$lineI] .= "\n"; # add the newline that was stripped in the split op } my $nodes = @lineage / 3; print "There are $nodes nodes in the tree\n"; # read the consensus events $refline = 0; while( !($cfdata[$refline++]=~/^\#consevent/) ){ } for( my $nI = 0; $nI < 11; $nI++ ) { my @events = split /\s+/, $cfdata[$refline+$nI]; $prev = 0; my $event_start = -1; my $outdat; for( my $eventI = 0; $eventI < @events; $eventI++ ) { next if ($eventI%2 == 1); $event_start = $eventI if ($events[$eventI] >= $event_sup && $event_start == -1); if ($events[$eventI] < $event_sup && $event_start != -1) { if( ($eventI-$event_start)/2 > $event_len) { my $l = $refsites[$event_start/2]; my $r = $refsites[$eventI/2]; $outdat .= "\n"; } $event_start = -1; }; } for( my $lI = 0; $lI < @lineage; $lI++ ) { next unless defined($outdat); if($lineage[$lI] =~ /name="(\d+)/) { next unless $1 eq reverse($nI); $lineage[$lI+1] .= $outdat; } } } #swap in known species names for( my $lineI = 0; $lineI < @lineage; $lineI++ ) { if($lineage[$lineI] =~ /speciation name="(\d+)/ ) { my $rev = reverse($1); if($rev <= @names) { my $newname = $seqnames[$names[$rev-1]-1]; $lineage[$lineI] =~ s/speciation name="\d+/speciation name="$newname/; }else{ $lineage[$lineI] =~ s/speciation name="\d+/speciation name="Ancestor $rev/; } } } print XMLOUT qq{ }; print XMLOUT @lineage; print XMLOUT "\n\n\n\n"; exit 0;