#!/usr/bin/perl # # A hacky implementation of a genetic algorithm which looks for overrpresented motifs in a test set (compared to a control set) # use strict; my @alphabet = ("A", "T", "G", "C", ".", "Y", "R", "S", "B", "D", "H", "V", "M", "K", "W"); # the alphabet that will make up all of the motifs in our genepool. Here we use "." instead of N (. matches anything in a regular expression) my $DEBUG = 0; # if this is set to 1 then the script outputs more debug information to STDOUT # some settings my $max_generations = 100000; # how many generations should we run for before assuming that the GA isn't going to produce a better result? my $max_genepool = 32; # how big should the genepool be, initially? my $max_pattern_length = 12; # how big should the patterns we seed the genepool with initially be? my $endpoint = 75; # what's our endpoint? See the tutorial writeup for more information my $mutate_rate = 20; # what's the chance (in 1000) of a particular motif being mutated in any one generation? my $max_loops = 250; # used when picking motifs to carry over to the next generation my $toxic_at = 2000; # we keep track of the best fitness in the genepool. If it stays stagnant then we gradually increase the mutation rate ($mutate_rate). At some point - 2000 - then we should give up as the GA obviously isn't going to produce a better result. my $check_reverse = 1; # should we check the reverse complement of sequences? my $check_dir = 1; # should we check both 5' to 3' and 3' to 5'? my $collect = 32; # how many times should we run the algorithm? (to collect only one motif, change this to 0) my %collected; # used to store solutions whose fitness exceeds the endpoint # data to use during fitness evalution my @pos = get_seqs_from_fasta("clusters_5_train.fa"); # converts the sequences in a FASTA file to an array my @neg = get_seqs_from_fasta("clusters_5_control.fa"); # ditto my @genepool = create_life(); # first seed the genepool my $current_best; # we keep track of the current best fitness in the genepool my $current_mutate_rate; # we vary the mutation rate; see description of $toxic_at, above my %fitness_cache; # we keep track of the fitnesses of motifs so that if the same motif comes up in another generation we don't have to recalculate fitness for (my $generation = 0; $generation < $max_generations; $generation++) { print STDERR "GENERATION $generation (COL ".scalar(keys(%collected)).", CMR $current_mutate_rate, POOLSIZE ".scalar(@genepool).") ==> "; # assess fitness my %fitness; foreach my $chr (@genepool) { if ($fitness_cache{$chr}) { $fitness{$chr} = $fitness_cache{$chr}; } else { $fitness{$chr} = assess_fitness($chr); $fitness_cache{$chr} = $fitness{$chr}; } print STDERR $fitness{$chr}." " if $DEBUG; if ($fitness{$chr} >= $endpoint) { # if the fitness of the motif we are currently evaluating is more than the endpoint... if (!$collect) { # if collect is 0, just print out the current motif and end the script print STDERR "\nEndpoint ($endpoint) achieved with $chr\n"; print STDERR assess_fitness($chr, 1); exit; } } } print STDERR "\n" if $DEBUG; # sort by fitness my @sorted = sort {$fitness{$b} <=> $fitness{$a}} keys(%fitness); print STDERR "Best: ".$sorted[0].", ".assess_fitness($sorted[0], 1)." (".$fitness{$sorted[0]}.")\n"; if (($collect) && ($fitness{$sorted[0]} >= $endpoint)) { # if the fitness of the motif we are currently evaluating is more than the endpoint... print STDERR "Collected another motif (".$sorted[0]."), reseeding genepool and starting afresh.\n"; $collected{$sorted[0]} = 1; if (scalar(keys(%collected)) >= $collect) { print STDERR "Collected enough motifs ($collect)."; open(LOG, ">ga_log.txt"); foreach my $key (keys(%collected)) { print LOG $key."\n"; } close(LOG); exit; } @genepool = create_life(); # start the whole process again until we've collected $collect motifs. $generation = 0; next; } if ($fitness{$sorted[0]} eq $current_best) { $current_mutate_rate++; # increase the mutation rate if the pool looks stale (we're stuck at a local maxima) if ($current_mutate_rate >= $toxic_at) { print STDERR "Pool too toxic (toxic at $toxic_at)!\n"; foreach my $chr (@genepool) {print STDERR "$chr ==> ".$fitness{$chr}."\n";} exit; } } else { $current_mutate_rate = $mutate_rate; } $current_best = $fitness{$sorted[0]}; # keep track of the current best # select half of the pool as breeders my @selected; my $loops = 0; while ((scalar(@selected) <= (scalar(@genepool) / 2)) && ($loops <= $max_loops)) { for (my $i=0; $i < scalar(@sorted); $i++) { if ((rand(100) <= $fitness{$sorted[$i]}) || ($i <= 2)) { $fitness{$sorted[$i]} = -1; # take it out of the running push(@selected, $sorted[$i]); } } $loops++; } # choose how many children we should create my $num_children = (scalar(@genepool) / 4) + rand(scalar(@genepool) / 2); if (scalar(@genepool) >= $max_genepool) { $num_children = ($num_children / 4) * 3; # reduce number of kids if there's a surplus population } # kill off the unselected motifs @genepool = @selected; # breed for (my $i=0; $i < $num_children; $i++) { my $child; # pick two parents my $parent_a = $genepool[pickrand(\@genepool)]; my $parent_b = $genepool[pickrand(\@genepool)]; # do some recombination my $half_a = length($parent_a) / 2; my $half_b = length($parent_b) / 2; if (rand(100) <= 50) {$child .= substr($parent_a, 0, $half_a);} else {$child .= substr($parent_a, $half_a);} if (rand(100) <= 50) {$child .= substr($parent_b, 0, $half_b);} else {$child .= substr($parent_b, $half_b);} push(@genepool, $child); } # mutate motifs if necessary for (my $c=0; $c < scalar(@genepool); $c++) { my $chr = $genepool[$c]; if (rand(1000) <= $current_mutate_rate) { my $type = rand(100); if ($type <= 25) { # polymorphism my $pos = int(rand(length($chr) - 1)); my $newpos = pickrand(\@alphabet); my $oldpos = substr($chr, $pos, 1, $newpos); print STDERR "Mutation: replaced $oldpos with $newpos at pos $pos, chr $c\n" if $DEBUG; $genepool[$c] = $chr; # update the genepool } elsif ($type <= 50) { # insertion my $pos = int(rand(length($chr) - 1)); # insert after this base my $newpos = pickrand(\@alphabet); my $existing_pos = substr($chr, $pos, 1); my $oldpos = substr($chr, $pos, 1, $existing_pos.$newpos); print STDERR "Mutation: added $newpos after $existing_pos at pos $pos, chr $c\n" if $DEBUG; $genepool[$c] = $chr; # update the genepool } elsif ($type <= 75) { # deletion my $pos = int(rand(length($chr) - 1)); my $oldpos = substr($chr, $pos, 1, "x"); $chr =~ s/x//g; print STDERR "Mutation: deleted base $oldpos at pos $pos, chr $c\n" if $DEBUG; $genepool[$c] = $chr; # update the genepool } else { # gross aberration (inversion, deletion, duplication) my $gab = rand(100); if ($gab <= 33) { # inversion my $pos = int(rand(length($chr) - 1)); my $maxlen = length(substr($chr, $pos)); my $randlen = int(rand($maxlen)) + 1; my $oldseq = substr($chr, $pos, $randlen); my $was = substr($chr, $pos, $randlen, reverse_dna($oldseq)); print STDERR "Mutation: gross inversion from pos $pos of len $randlen (".$was." to ".reverse_dna($oldseq)."), chr $c\n" if $DEBUG; $genepool[$c] = $chr; # update the genepool } elsif ($gab <= 66) { # deletion my $pos = int(rand(length($chr) - 1)); my $maxlen = length(substr($chr, $pos)); my $randlen = int(rand($maxlen)) + 1; my $oldseq = substr($chr, $pos, $randlen); my $was = substr($chr, $pos, $randlen, ""); print STDERR "Mutation: gross deletion from pos $pos of len $randlen (was len ".length($genepool[$c])." now ".length($chr)."), chr $c\n" if $DEBUG; $genepool[$c] = $chr; # update the genepool } else { # duplication my $pos = int(rand(length($chr) - 1)); my $maxlen = length(substr($chr, $pos)); my $randlen = int(rand($maxlen)) + 1; my $oldseq = substr($chr, $pos, $randlen); my $was = substr($chr, $pos, $randlen, $oldseq.$oldseq); print STDERR "Mutation: gross duplication of pos $pos, len $randlen ($oldseq) on chr $c\n" if $DEBUG; $genepool[$c] = $chr; # update the genepool } } } } } # picks an element from an array at random (can't remember why this is so complex!) sub pickrand { my @list = @{$_[0]}; my $rand = rand(1000); my $num = scalar(@list); my $binsize = 1000 / $num; my $bin = int($rand / $binsize); if ($list[$bin]) {return $list[$bin];} else {return pickrand(\@list);} } sub assess_fitness { my $pattern = $_[0]; my $detailed = $_[1]; # count how many times the pattern occurs in the positive set vs. the negative set. # return the ratio. # check the reverse complement of the sequence, too. # if we're collecting, flatten the fitness of any seqs we've seen before. if ($collected{$pattern}) {return 0;} # don't let empty patterns count if (length($pattern) <= 0) {return 0;} my $cpos = count($pattern, \@pos); my $cneg = count($pattern, \@neg); my $pos = ($cpos / scalar(@pos)) * 100; my $neg = ($cneg / scalar(@neg)) * 100; if ($detailed) { return sprintf("pos %d %d%%, neg %d %d%%", $cpos, $pos, $cneg, $neg); } else { return sprintf("%f", $pos - $neg); # as high as possible out of 100 would be nice. Min -100, max 100 } } sub count { my $pattern = $_[0]; my @seqs = @{$_[1]}; my %seen; foreach my $seq (@seqs) { if (rawcount($pattern, $seq)) {$seen{$seq} = 1; next;} if ($check_reverse) { my $revseq = reverse_dna($seq); if (rawcount($pattern, $revseq)) {$seen{$seq} = 1; next;} } if ($check_dir) { my $dirseq = reverse($seq); if (rawcount($pattern, $dirseq)) {$seen{$seq} = 1; next;} if ($check_reverse) { my $revseq = reverse(reverse_dna($seq)); if (rawcount($pattern, $revseq)) {$seen{$seq} = 1; next;} } } } #print STDERR "Counted ".scalar(keys(%seen))." of $pattern\n"; return scalar(keys(%seen)); } sub average { my @list = @{$_[0]}; my $average; foreach my $item (@list) {$average += $item;} return $average / scalar(@list); } sub rawcount { my $pattern = $_[0]; my $seq = $_[1]; # convert our motif into a regular expression $pattern =~ s/R/\[AG\]/g; $pattern =~ s/Y/\[CT\]/g; $pattern =~ s/M/\[CA\]/g; $pattern =~ s/K/\[TG\]/g; $pattern =~ s/W/\[TA\]/g; $pattern =~ s/S/\[CG\]/g; $pattern =~ s/B/\[CTG\]/g; $pattern =~ s/D/\[ATG\]/g; $pattern =~ s/H/\[ATC\]/g; $pattern =~ s/V/\[ACG\]/g; $pattern =~ s/N/\[ATCG\]/g; $pattern =~ s/2/\.\{1,2\}/g; $pattern =~ s/3/\.\{1,3\}/g; $pattern =~ s/4/\.\{1,4\}/g; $pattern =~ s/5/\.\{1,5\}/g; $pattern =~ s/6/\.\{1,6\}/g; my $count = 0; # try to match the regular expression while ($seq =~ /$pattern/g) { $count++; } return $count; } sub reverse_dna { my $string = $_[0]; $string =~ tr/[A,T,G,C]/[T,A,C,G]/; $string =~ tr/[R,Y,M,K,W,S]/[Y,R,K,M,S,W]/; $string =~ tr/[B,D,H,V]/[V,H,D,B]/; return reverse($string); } # this sub creates a new genepool, filling it with $max_genepool random motifs of max length $max_pattern_length sub create_life { my @genepool; for (my $i=0; $i < $max_genepool; $i++) { # what length of pattern to generate? my $len = rand($max_pattern_length) + 1; my $pattern; for (my $k=0; $k < $len; $k++) { $pattern .= pickrand(\@alphabet); } push(@genepool, $pattern); } return @genepool; } # simple helper function to convert a FASTA file to an array sub get_seqs_from_fasta { my $filename = $_[0]; my $seq_size = $_[1]; open(INPUT, $filename) or die("Couldn't open $filename (did you download them?)"); my @lines = ; close(INPUT); my $lines = "@lines"; my @seqs; my @fasta = split(/>/, $lines); foreach my $seq (@fasta) { my @elements = split(/\n/, $seq); $elements[0] = ""; $seq = "@elements"; $seq =~ s/\s//g; if (length($seq) <= 0) {next;} $seq = uc($seq); # only return uppercase (for the index function) if ($seq_size) { if (length($seq) <= $seq_size) {next;} if (length($seq) > $seq_size) { my $extra = length($seq) - $seq_size; $seq = substr($seq, int($extra / 2), $seq_size); } } if (scalar(@seqs) >= 50) {next;} push(@seqs, uc($seq)); } if (!scalar(@seqs)) {die("No sequences in file $filename");} return @seqs; }