This programme uses a genetic algorithm to get the string "Hello, World!"
. It can be summarized as follows:
- Create a random initial population.
- Let the best reproduce and kill the worst until we get a 'good_enough' result.
I used doctest
to verify correctness.
import doctest
import random
def random_char(start=' ',end='|'):
"""
Returns a random char from ' ' [SPACE] to '|'.
>>> random.seed('EXAMPLE')
>>> random_char()
'v'
"""
return chr(random.randint(ord(' '),ord('|')))
def random_string(length):
"""
Returns a random string os the given length.
>>> random.seed('EXAMPLE')
>>> random_string(10)
"vG5iHyPWG'"
"""
return ''.join([random_char() for _ in range(length)])
def create_strings_pool(length,number):
"""
Creates a 'pool' (list) of number strings of the
given length.
>>> random.seed('EXAMPLE')
>>> create_strings_pool(3,4)
['vG5', 'iHy', 'PWG', "'TH"]
"""
return [random_string(length) for _ in range(number)]
def char_difference(char_1,char_2):
"""
Return the absolute difference between the ASCII values
of two chars.
>>> char_difference('a','b')
1
>>> char_difference('a','f')
5
>>> char_difference('#','A')
30
"""
return abs(ord(char_1) - ord(char_2))
def fitness(candidate,result):
"""
Detrmines how much similar the candidate string is
compared to the result. Char distance is counted:
ex. 'cbr' is more similar to 'car' than 'ccr'
because 'b' is nearer to 'a' then 'c'.
>>> fitness('Hfllo','Hello')
1
>>> fitness('! QWE','World')
218
>>> fitness('ccc','abc')
3
"""
fitness = 0
for index,candidate_char in enumerate(candidate):
result_char = result[index]
fitness += char_difference(candidate_char,result_char)
return fitness
def sort_by_fitness(population,result):
"""
Returns a list where the most fit elements are at the start.
>>> sort_by_fitness(['acc','add','asd','acb','aaa','!!!'],'abc')
['acc', 'acb', 'add', 'aaa', 'asd', '!!!']
"""
return list(sorted(population, key = lambda x: fitness(x,result)))
def take_fittest(string_pool,result,elite_number):
"""
Returns tphe elite_number most fit individuals.
>>> take_fittest(['acc','add','asd','acb','aaa','!!!'],'abc',3)
['acc', 'acb', 'add']
"""
sorted_string_pool = sort_by_fitness(string_pool,result)
return sorted_string_pool[:elite_number]
def kill_less_fit(pool,result,must_kill_number):
"""
Returns all the individual but the kill_number worst ones.
>>> kill_less_fit(['acc','add','asd','acb','aaa','!!!'],'abc',2)
['acc', 'acb', 'add', 'aaa']
"""
sorted_string_pool = sort_by_fitness(pool,result)
return sorted_string_pool[:-must_kill_number]
def crossover(father,mother):
"""
Generates a child by crossover, that is mixing the father and
the mother.
>>> random.seed('EXAMPLE')
>>> crossover('Hello','Noble')
'Noble'
# Weird, usually gives a mix.
"""
son = []
for index,father_char in enumerate(father):
mother_char = mother[index]
if random.random() < 0.50:
son.append(father_char)
else:
son.append(mother_char)
return ''.join(son)
def mutate(string,mutation_rate):
"""
Returns a string where each character has (1 - mutation_rate * 0.9) probability
of being randomly changed.
>>> random.seed("EXAMPLE")
>>> mutate("Hello my dear, how are you?",0.7)
"5HPW' my dDarz h7W =re you?"
"""
return ''.join([i if random.random() < (mutation_rate*0.9) else random_char() for i in string])
def make_child(father,mother,mutation_rate):
"""
Generates a child from mother and father using the crossing_over
and then mutates it accordingly to mutation_rate.
>>> random.seed("EXAMPLE")
>>> make_child("Hello","Hulli",0.6)
'Hulli'
"""
off_spring = crossover(father,mother)
if random.random() < mutation_rate:
return mutate(off_spring,mutation_rate)
else:
return off_spring
def genetic_process(result, pool_size, fitness_treshold, mutation_rate,
elite_number, max_generations, logging):
"""
Simulates a gentic evolution.
1) A random population is created
2) The most fit individuals get to reproduce and the worst are killed
until a 'good_enough' individual is born.
Parametres:
@ result: The target string you want to obtain.
@ pool_size: The size of the starting population.
@ fitness_treshold: How much good you want the solution:
0 means you want a perfect solution, a high number
means that even a far solution is ok.
@ mutation_rate: The probability of random mutations happening:
0 means no random mutations
1 means many random mutations
@ elite_number: The number of best individuals that get to reproduce
each generation.
@ max_generations: The maximum number of generations allowed. If a
solution is not found in that many generations, the best individual
so far is returned.
@ logging: True if you want this evolution to print regular output,
else False.
Return value:
@ A string that is inside the fitness_treshold.
>>> random.seed("Example")
>>> genetic_process("ABC", 20, 5, 0.8, 6, 4, True)
Generation number: 0.
So far the best individual is @E2 with a fitness of 21.
The top five is ['@E2', 'H3I', '@Ed', '@@ ', 'A1*'].
Generation number: 1.
So far the best individual is HEI with a fitness of 16.
The top five is ['HEI', 'AE2', '@E2', '@E2', 'H3I'].
Generation number: 2.
So far the best individual is HE@ with a fitness of 13.
The top five is ['HE@', 'HEI', 'AES', 'AE2', '@E2'].
Generation number: 3.
So far the best individual is AEI with a fitness of 9.
The top five is ['AEI', 'HE@', 'HEI', 'AES', 'AES'].
['AEI', 'HE@', 'HEI', 'AES', 'AES']
"""
pool = create_strings_pool(len(result),pool_size)
pool = sort_by_fitness(pool,result)
for generation_number in range(max_generations):
best_individuals = take_fittest(pool,result,elite_number)
# Sometimes I select the second and the third for greater genetic variability
the_best = random.choice(best_individuals[:3])
for individual in best_individuals:
pool.append(make_child(the_best,individual,mutation_rate))
pool = kill_less_fit(pool,result,elite_number)
if logging:
print("""Generation number: {}.
So far the best individual is {} with a fitness of {}.
The top five is {}.\n""".format(
generation_number + 1, take_fittest(pool,result,1)[0],
fitness(take_fittest(pool,result,1)[0],result),
take_fittest(pool,result,5)))
if fitness(take_fittest(pool,result,1)[0],result) < fitness_treshold:
break
return take_fittest(pool,result,5)
def main():
""" Example of use of the genetic algorithm. """
TARGET = "Hello, World!"
POOL_SIZE = 100
FITNESS_TRESHOLD = 4
MUTATION_RATE = 1
ELITE_NUMBER = 20
NUMBER_OF_GENERATIONS = 10**6
LOGGING = True
print((genetic_process(TARGET,POOL_SIZE,FITNESS_TRESHOLD,
MUTATION_RATE,ELITE_NUMBER,
NUMBER_OF_GENERATIONS, LOGGING)))
if __name__ == "__main__":
doctest.testmod()
main()
-
6\$\begingroup\$ Wow, you really spent some time on this. And your documentation is awesome. Kudos! \$\endgroup\$Phrancis– Phrancis2015年01月18日 22:39:59 +00:00Commented Jan 18, 2015 at 22:39
2 Answers 2
There seems to be some confusion around
mutation_rate
. Firstly, the documentation@ mutation_rate: The probability of random mutations happening: 0 means no random mutations 1 means many random mutations
conflicts
each character has (1 - mutation_rate * 0.9) probability of being randomly changed
Secondly,
mutation_rate
is used in two places: inmake_child
to decide whether to callmutate
, and inmutate
to decide whether to mutate each character. I'm not familiar with genetic programming practices so I can't say if that is a typical approach, but in any case I would expect to be able to specify the two probabilities separately.- There are some magic numbers buried in the code, such as the 0.9 above, and the 3 in
genetic_process
. These should be parameters. - The function
kill_less_fit
seems redundant as one could instead usetake_fittest(pool, result, pool_size)
. In
fitness
andcrossover
code like thisfor index,candidate_char in enumerate(candidate): result_char = result[index]
could be simplified to this
for candidate_char, result_char in zip(candidate, result):
which gives me the idea to simplify
fitness
further toreturn sum(char_difference(candidate_char,result_char) for candidate_char, result_char in zip(candidate, result))
It is necessary to consult an ASCII table to find out what characters
random_char
can return. It would be clearer and also more flexible to use a string argument that includes all allowed characters.
Python comes with "batteries included", and you can simplify your code by using them. For example:
from difflib import SequenceMatcher
def fitness(candidate, result):
"""Determine how similar the candidate and result strings are.
>>> fitness('Hfllo','Hello')
0.8
>>> fitness('! QWE','World')
0.2
>>> fitness('ccc','abc')
0.3333333333333333
"""
return SequenceMatcher(None, candidate, result).ratio()
Note that the ratio is always between 0
and 1
, and you would need to invert the ratio or your sorting (reverse=True
) as low numbers are now worse.
Rather than playing with ord
and chr
, I would use the constants defined by string
:
>>> import string
>>> string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
>>> string.digits
'0123456789'
>>> string.ascii_uppercase
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
>>> string.ascii_lowercase
'abcdefghijklmnopqrstuvwxyz'
You could define e.g.
CHARACTERS = "".join([" ", string.punctuation, string.digits,
string.ascii_uppercase, string.ascii_lowercase])
then simplify your random_char
function:
def random_char():
"""Provide a randomly-selected character.
>>> random.seed(0)
>>> for _ in range(5):
random_char()
'l'
'd'
'6'
'\\'
'F'
"""
return random.choice(CHARACTERS)
crossover
could also be much simpler:
def crossover(father, mother, mutation):
"""Return a combination of father and mother with random mutations.
>>> random.seed(0)
>>> crossover("hello", "world", 0)
'welld'
>>> crossover("hello", "world", 0.5)
'wclld'
>>> crossover("hello", "world", 1)
'aV)Pw'
"""
return "".join([random.choice(pair) if random.random() > mutation
else random_char() for pair in zip(father, mother)])
I would move the constants from main
up to the top of the script, so it's easier for someone to open the script and quickly tweak them. Then you can remove main
entirely, and just call genetic_process
from the if __name__ == '__main__'
block.