/* * Name: Geoff Kuenning * Course: CS 70, Spring 2001 * Assignment #7 * * Copyright 2001, Geoff Kuenning. * * This file contains the main program for the assignment. * * The program implements a very simple genetic algorithm, using * straightforward (but inefficient) data structures. The algorithm * finds square roots of positive integers through a process of * natural selection. * * Usage: * * assign_07 [-d] [-g n] [-m n] [-p n] [-s n] [-S n] number ... * * In its simplest form, the program is invoked with a one or more * positive integer arguments. For each argument, a random population * is generated and an evolutionary simulation searches for a value * that, when squared, is as close to the given integer as possible. * * Switches can be used to modify the program behavior as follows: * * -d Produce debugging output, showing the best individual * and the generation number each time a better * individual is found. * -g n Specify the number of generations to run for (default 100). * -m n Specify the probability of genetic mutation (default * 0.001). Each time a new organism is generated, it is * mutated with a probability equal to this value. If * the organism mutates, a single randomly selected digit * is changed to a new value from 0 to 9, with uniform * distribution. * -p n Specify the size of the population of organisms * (default 100). This is the number of organisms that * compete to survive into the next generation. * -s n Specify the size of the selection pool (default 100). * This is the number of organisms that survive into the * next generation. Note that surviving gives an * organism a chance to become a parent, but does not * guarantee it. The difference between the -p value * and the -s value is the number of new organisms * produced at the beginning of each generation. * -S n Specify a seed for the random-number generator. If no * random seed is given, one is derived from the time of * day. * * Each organism in the gene pool is an integer, represented for * convenience as a linked list of decimal digits. The representation * is very handy for doing genetic crossover and mutation, and is easy * to understand, but it is quite inconvenient for use in squaring a * number. The reader should note that the representation is quite * inefficient in both space and time. Serious users of genetic * algorithms put great effort into efficient representations, because * they may simulate many hundreds of thousands of generations, each * with thousands or millions of organisms. * * Many other aspects of this program are simplified in other ways. * The program should not be taken as an example of how to implement a * genetic algorithm correctly, although it can be useful in * understanding the general method. */ #include "intlist.hh" #include #include // For DBL_MAX #include #include #include #include /* * Table of contents: the following functions are defined in this file */ int main(int argc, char* argv[]); // Main genetic algorithm static int processOptions(int argc, char* argv[], bool& debug, int& genePoolSize, int& selectionPoolSize, double& mutationProbability, int& maxRandomSurvivors, int& maxGenerations); // Process option arguments static void usage(char* programName); // Issue a usage message static void setRandomSeed(); // Set a "random" random seed static void findRoot(double target, bool debug, int genePoolSize, int selectionPoolSize, double mutationProbability, int maxRandomSurvivors, int maxGenerations); // Find square root genetically IntList* makeGenePool(int size); // Create a pool of organisms IntList newOrganism(); // Create a single new organism int randomInt(int maximum); // Generate a random integer void naturalSelection(IntList* genePool, int poolSize, int maxRandomSurvivors); // Perform natural selection int compareFitness(const void* first, const void* second); // Compare fitness of two organisms double fitness(const IntList& organism); // Calculate fitness of an organism IntList* findBest(IntList* genePool, int poolSize); // Find the best organism in the pool void mutate(IntList& target, int position, int newGene); // Do the mutation operation (in place) IntList crossover(const IntList& parent1, const IntList& parent2, int position); // Do the combination operation double intListToDouble(const IntList& toConvert); // Convert an IntList to an integer. // ..Functionality is limited, see // ..header comments on function. /* * The random-seed function needs to know the size of the machine * word. It also needs to run the pseudorandom-number generator * (PRNG) for a few cycles to make sure it has begun a unique * sequence. These two constants define those values. Note that we * implicitly assume 8 bits to the byte; this is true on any modern * machine. */ const unsigned int LONG_BITS = sizeof(long) * 8; // Number of bits in a long const unsigned int RAN_ITERS = 10; // No. of iterations in randomize loop /* * Defaults for switch-selectable arguments. */ const int GENE_POOL_SIZE = 1000; // -p: Number of organisms in // ..population const int MAX_GENERATIONS = 50; // -g: Number of generations const int MAX_RANDOM_SURVIVORS = GENE_POOL_SIZE / 10; // -r: Number of random survivors to // ..next generation const double MUTATION_PROBABILITY = 0.001; // -m: Probability of mutation const int SELECTION_POOL_SIZE = GENE_POOL_SIZE / 2; // -s: Number of survivors to // ..next generation /* * Global constants of the implementation. * * The number of digits in an organism is indirectly limited by the * maximum precision that the computer can represent. We use * double-precision arithmetic to extend the precision (doubles store * more bits on most machines), but even so we are limited to about 14 * digits. Since the organism's value will be squared, that limits * the organism itself to 7 digits. */ const int ORGANISM_BASE = 10; // Number base of representation const int ORGANISM_DIGITS = 7; // Digits in an organism /* * Global variables. * * As discussed in class, global variables are generally considered to * be evil. However, there are times when they are unavoidable. In * the current case, the population is sorted with the qsort() library * routine, and because of the way that qsort works, the comparison * routine needs to access the target value without having it passed * as a parameter. One solution is to use a global variable; the * other would be to create a class, store the target value in a * static data member, and make the comparison routine be a static * member function. The first approach seems simpler and cleaner, so * we'll use a global to hold the target. The name arises because * the target value is the square of the successful organism's value. * * Even though we limit ourselves to integer roots, we store the * target value as a double to get increased precision. */ static double squaredValue; // Target value we're finding root of /* * Main driver for testing DNA recombination. This function processes * all option arguments, generates a pool of organisms, and then * applies the rules of natural selection to try to find the one that * best satisfies the fitness criterion. In this case, a fit organism * is one that approximates the square root of the passed argument. * documentation. */ int main( int argc, // Argument count char* argv[]) // Argument vector { bool debug; // True to write debugging output int genePoolSize; // Number of organisms to simulate int selectionPoolSize; // Number of survivors to next // ..generation int maxGenerations; // Number of generations to run double mutationProbability; // Probability of a mutation int maxRandomSurvivors; // Maximum number of "unfit" survivors // ..to next generation /* * Process all option arguments. Note that the last four * arguments are reference parameters, which means that the * function can modify them. The return value is the number of * the first non-option argument on the command line. */ int argNo = processOptions(argc, argv, debug, genePoolSize, selectionPoolSize, mutationProbability, maxRandomSurvivors, maxGenerations); /* * Check the arguments to be sure we have at least one target. */ if (argNo >= argc) usage(argv[0]); /* * For each target, use natural selection to find the square root. */ for ( ; argNo < argc; argNo++) findRoot(atof(argv[argNo]), debug, genePoolSize, selectionPoolSize, mutationProbability, maxRandomSurvivors, maxGenerations); return 0; } /* * Option processing. Scan through the arguments, looking at * switches. If we run into a non-switch argument, exit the function. * * If an illegal switch is detected, we will print a usage message to * cerr and terminate the program with an exit status of 2. * * The return value of this function is the number of the first * non-switch argument encountered. If there are no non-switch * arguments, the return value equals argc. * * The reader will note that this function is fairly long, even though * it does relatively little. That is generally true of * switch-processing functions, and it is not considered bad style so * long as the function limits itself to simple operations such as * setting Boolean flags. */ static int processOptions( int argc, // Argument count passed to main char* argv[], // Argument vector passed to main bool& debug, // MODIFIED: set true if debugging // ..enabled int& genePoolSize, // MODIFIED: set to number of // ..organisms to simulate int& selectionPoolSize, // MODIFIED: set to number of // ..survivors into next generation double& mutationProbability, // MODIFIED: set to probability of // ..genetic mutations int& maxRandomSurvivors, // MODIFIED: set to maximum number of // ..randomly-chosen survivors int& maxGenerations) // MODIFIED: set to number of // ..generations to simulate { /* * Before beginning the loop, we set defaults for all options. By * doing the defaults in a single place, it is easy to change the * defaults later if necessary. * * One of the defaults is the random seed. If the -s switch * appears, we can then clobber the random seed with the specified * value. */ debug = false; genePoolSize = GENE_POOL_SIZE; selectionPoolSize = SELECTION_POOL_SIZE; maxGenerations = MAX_GENERATIONS; mutationProbability = MUTATION_PROBABILITY; maxRandomSurvivors = MAX_RANDOM_SURVIVORS; setRandomSeed(); // Pick a seed for the PRNG /* * Note that the loop index is modified inside the body of * the loop; this is generally poor style but is common in * argument processing. In this case, the modification is done * inside the processing of the "s" switch. */ int argNo; for (argNo = 1; argNo < argc; argNo++) { if (argv[argNo][0] != '-') // End of switches? If so, exit break; // ..option processing /* * By convention, the special switch option "--" means that * all following arguments are non-option arguments, even if * they start with a hyphen. If we run into that switch, we * must skip over it but then exit the option-processing loop. * * We use C++ strings in the comparison here primarily for * illustrative purposes. Note that we must explicitly * typecast one of the arguments of the == operator to * "string" or the comparison won't work -- and we WILL NOT * get any compiler error messages! */ if (argv[argNo] == string("--")) { argNo++; // Skip over the -- break; // Go to non-option argument processing } /* * All other switch options should consist of a single hyphen, * an alphabetic character, and nothing else. At this point * we know that the hyphen exists. Check to make sure that * there is exactly one character following it. */ if (argv[argNo][2] != '\0') usage(argv[0]); /* * Process switch options. We do this in a "switch" statement * so that it is easy to add new options later. */ switch (argv[argNo][1]) { case 'd': // -d: Turn on debugging debug = true; break; case 'g': // -g n: specify generations /* * Processing an option that has an argument is a bit * tricky. First, we must make sure that there is * actually a following argument. Second, we must * process that argument appropriately (in this case, * by converting it to an integer and saving it in * maxGenerations). Third, we must make sure that the * option-processing loop does not try to treat the * argument as a switch itself. We do that by * incrementing the loop index, as noted in the * comments before the loop. The first and third * operations are combined. */ if (++argNo >= argc) // Make sure there's an argument usage(argv[0]); // ..if not, complain and exit /* * Convert the next argument to an integer (atoi) and * save it. If the next argument isn't an integer, * atoi will probably return zero. But that's OK, * because we'll check it later. */ maxGenerations = atoi(argv[argNo]); break; case 'm': // -m n: specify mutation probability if (++argNo >= argc) // Make sure there's an argument usage(argv[0]); // ..if not, complain and exit mutationProbability = atof(argv[argNo]); break; case 'p': // -p n: specify gene pool size if (++argNo >= argc) // Make sure there's an argument usage(argv[0]); // ..if not, complain and exit genePoolSize = atoi(argv[argNo]); break; case 'r': // -r n: specify max random survivors if (++argNo >= argc) // Make sure there's an argument usage(argv[0]); // ..if not, complain and exit maxRandomSurvivors = atoi(argv[argNo]); break; case 's': // -s n: specify selection pool size if (++argNo >= argc) // Make sure there's an argument usage(argv[0]); // ..if not, complain and exit selectionPoolSize = atoi(argv[argNo]); break; case 'S': // -S n: specify random seed if (++argNo >= argc) // Make sure there's an argument usage(argv[0]); // ..if not, complain and exit /* * Convert the next argument to a long (atol) and pass * it to srand48. If the next argument isn't an * integer, atol will probably return zero. But * that's OK, because the behavior of the program will * still be reproducible. */ srand48(atol(argv[argNo])); break; default: // Default: unrecognized option usage(argv[0]); break; } } /* * Check the legality of invocation. Now that we have extracted * all of the user's options, make sure that they are legal and * that they are consistent with each other. If any problems are * detected, either abort the program or correct them. */ if (maxGenerations <= 0) { cerr << "Number of generations must be positive.\n"; usage(argv[0]); } if (genePoolSize <= 0) { cerr << "Gene pool size must be positive.\n"; usage(argv[0]); } if (selectionPoolSize > genePoolSize) { cerr << "Selection pool cannot be bigger than gene pool, reducing it to " << genePoolSize << endl; selectionPoolSize = genePoolSize; } /* * Argument processing is done. The calling function needs to * know how many arguments we swallowed so that it can tell where * the non-option arguments begin. So we return the number of the * first non-option argument. */ return argNo; } /* * Print a usage message and exit. As is conventional, the exit * status is 2 if there is a usage error. */ static void usage( char* programName) // Name we were run under { cerr << "Usage:\t" << programName << " [-d] [-g n] [-m n] [-p n] [-r n] [-s n] [-S n] number ...\n"; cerr << "Switches:\n"; cerr << "\t-d\tProduce debugging outout.\n"; cerr << "\t-g n\tGive number of generations to run for.\n"; cerr << "\t-m n\tGive probability of mutation.\n"; cerr << "\t-p n\tGive population size.\n"; cerr << "\t-r n\tGive maximum number of randomly-chosen survivors.\n"; cerr << "\t-s n\tGive selection pool size (number of survivors).\n"; cerr << "\t-S n\tSet random seed\n"; exit(2); } /* * Set up a "random" random seed. This routine works by using the * time of day to initialize the pseudorandom number generator. Then, * since the time of day tends to be a fairly predictable number, it * cycles the PRNG a few times to make sure that it has entered a * unique part of its sequence. * * On a machine that supports /dev/random, a better way to initialize * the PRNG would be to read from that device. However, doing so would * complicate the code without adding instructional value. */ static void setRandomSeed() { /* * Get the time of day in seconds and microseconds. Note that the * system clock may not have microsecond resolution, so there is * no guarantee that all of the microsecond bits are significant */ struct timeval tv; // Time of day, for getting usecs struct timezone tz; // Dummy for gettimeofday (void) gettimeofday (&tv, &tz); /* * Calculate a 48-bit random seed, and use it (seed48) to * initialize the random-number generator. The seed is composed * of the Unix time, in microsecond resolution, converted to a * 48-bit number. We would have to do some pretty hairy integer * arithmetic with 32-bit numbers to get a correct seed value, and * frankly it's just not worth it. (This is a case where access * to assembly language would greatly simplify the task, but that * would be very non-portable.) That leaves us with two choices: * (1) use the "long long" type that g++ provides, but which is * not part of the C++ standard (yet), or (2) use double-precision * arithmetic to calculate intermediate results. The latter * option has the advantage that a double supports 56 bits in its * mantissa, so it can handle the problem, but it is also * portable. We'll do it that way. * * However, in using double-precision arithmetic we must be * careful to avoid variations in the way different machines * convert numbers that are larger than the maximum integer. Some * truncate; others set the result to the largest integer. So * we're careful to be sure that our results are never larger than * the largest representable integer. * * It's worth noting that the microsecond-adjusted time is * actually about 52 bits wide (the Unix time is up to 32 bits, * and the microseconds take just a hair under 20 bits). Thus, * the first truncation must discard the top 4 bits. That's not a * problem, since those bits are the most slowly varying of all. * * Also note that in setting seed16[2], we couldn't use a single * integral divisor because its value would overflow the size of a * long. That's why we use a const double and divide twice. An * alternative would be to type in the value of 2^32 by hand as a * double, which would be easy but slightly less readable. Since * this routine only executes once, the time lost in doing the * calculation dynamically is irrelevant. */ double wideSeed = (unsigned long)tv.tv_sec * 1000000.0 + tv.tv_usec; unsigned short seed16[3]; const double divisor = 1ul << 16; unsigned long topBits = (unsigned long)(wideSeed / divisor / divisor); seed16[2] = (unsigned short)topBits; wideSeed -= topBits * divisor * divisor; seed16[1] = (unsigned short)(wideSeed / divisor); wideSeed -= seed16[1] * divisor; seed16[0] = (unsigned short)wideSeed; seed48(seed16); /* * The above calculation will tend to generate a random seed that * has predictable values in some bits and unpredictable values in * others. Cycle the PRNG a few times so that we will have * departed from the predictable part of the sequence. */ for (unsigned int i = 0; i < RAN_ITERS; i++) drand48(); } /* * Use a genetic algorithm to find a square root. The algorithm * operates by repeatedly generating "organisms" that are then tested * for "fitness". In this case, a fit organism is one whose value, * when squared, is close to the target. In each generation, genetic * crossover and mutation are used to produce new organisms. In * addition, the fittest organisms survive unchanged into the next * generation. */ static void findRoot( double target, // Number to find square root of bool debug, // Produce debugging information int genePoolSize, // Size of gene pool (organisms) int selectionPoolSize, // Size of selection pool (see below) double mutationProbability, // Probability of a mutation int maxRandomSurvivors, // Maximum number of "unfit" survivors // ..to next generation int maxGenerations) // Number of generations to run for { /* * Because we use qsort to sort the gene pool by fitness, the * target value must be available in a global variable. See the * comments at the declaration of squaredValue near the top of the * file. */ squaredValue = target; /* * Start with a random population of the chosen size. */ IntList* genePool = makeGenePool(genePoolSize); /* * If we are debugging, we will print out the best candidate each * time a new one is found. To do this, we must track the best * fitness found so far. We start with the best set to the * largest possible number so that anything we find will be better. */ double bestFitness = DBL_MAX; /* * Simulate the requested number of generations */ for (int generation = 0; generation < maxGenerations; generation++) { /* * Choose the candidates who will survive until the next * generation and put them at the front of the gene pool. */ naturalSelection(genePool, genePoolSize, maxRandomSurvivors); /* * The best candidate in the pool is now at the front. If * we're debugging, report any improvement. */ if (debug) { IntList* best = findBest(genePool, genePoolSize); double newBestFitness = fitness(*best); if (newBestFitness < bestFitness) { bestFitness = newBestFitness; cerr << "Generation " << generation << ": " << *best << endl; } } /* * Fill up the remainder of the gene pool with new organisms * whose parents have been selected from the survivors list. */ for (int organism = selectionPoolSize; organism < genePoolSize; organism++) { /* * Choose two parents randomly. Although we use the terms * "father" and "mother", reproduction is actually * asexual; any two organisms can be the parents. We take * the trouble to be sure the father and mother are * actually different organisms. */ int father = randomInt(selectionPoolSize); int mother = randomInt(selectionPoolSize); while (father == mother) mother = randomInt(selectionPoolSize); /* * Cross the parents at a randomly selected place in the * gene string. Note that this is a simplified * approximation of the way real genetic crossover works. */ int crossoverPoint = randomInt(ORGANISM_DIGITS); genePool[organism] = crossover(genePool[father], genePool[mother], crossoverPoint); /* * Finally, introduce a random mutation (with small * probability). Again, this is a rough approximation of * reality. */ if (drand48() <= mutationProbability) mutate(genePool[organism], randomInt(ORGANISM_DIGITS), randomInt(ORGANISM_BASE)); } } /* * Report the best organism. We use double-precision arithmetic, * but print the results as integers. The setf and precision * calls will arrange that there is no decimal point. */ IntList* winner = findBest(genePool, genePoolSize); double root = intListToDouble(*winner); cout.setf(ios::fixed); cout.precision(0); cout << *winner << " * " << root << " = " << root * root << endl; /* * Since we allocated the gene pool with new[], we must clean it * up ourselves. */ delete[] genePool; } /* * Create an initial gene pool of randomly chosen organisms. The pool * is allocated with new[]. It is the caller's responsibility to * delete[] it. */ IntList* makeGenePool( int size) // Size of the pool to create { IntList* pool = new IntList[size]; for (int i = 0; i < size; i++) pool[i] = newOrganism(); return pool; } /* * Create a single new organism by generating a string of random * digits that is of the proper length. We create the random digits * in a fixed-length array, and then let the IntList constructor * produce the final result. */ IntList newOrganism() { int digits[ORGANISM_DIGITS]; // Space to create the string for (int i = 0; i < ORGANISM_DIGITS; i++) digits[i] = randomInt(ORGANISM_BASE); return IntList(digits, ORGANISM_DIGITS); } /* * Generate a uniformly distributed random integer between 0 * (inclusive) and a specified maximum value (exclusive). For * example, randomInt(6) will generate a number between 0 and 5, with * each value being equally probable. */ int randomInt( int maximum) // Maximum value (never generated) { return (int)(drand48() * maximum); } /* * Do natural selection by choosing the organisms that have the * greatest fitness and placing them into a preselected section of * the gene pool. * * The selection is done by the simple expedient of sorting the * organisms by fitness, with the most fit placed first, but with a * little bit of randomness thrown in. */ void naturalSelection( IntList* genePool, // Gene pool to select on int poolSize, // Size of the pool int maxRandomSurvivors) // Maximum no. of random survivors { /* * Selection boils down to just sorting. I'm old-fashioned, so * I'm going to use qsort. Frankly, I think the interface is * better than the STL sort monstrosity. See "man qsort" for more * information. */ qsort(genePool, poolSize, sizeof genePool[0], compareFitness); /* * In real genetics, the most-fit organisms sometimes don't * survive, and instead lesser-fit ones reproduce. This turns out * to be a good thing, because it introduces important variance * into the genetic pool. We'll randomize things very slightly to * simulate this behavior. * * Note that this simulation is poor in that all unfit organisms * have an equal chance to survive. A better implementation would * make the probability of survival dependent on the fitness. */ int randomSurvivors = randomInt(maxRandomSurvivors); for (int i = 0; i < randomSurvivors; i++) { int firstSlot = randomInt(poolSize); int secondSlot = randomInt(poolSize); if (firstSlot != secondSlot) swap(firstSlot, secondSlot); } } /* * Compare the fitness of two organisms, returning a result useful * with qsort: <0 means the first organism precedes the second, 0 * means they are equal, and >0 means the second organism should go * first. Because of how qsort works, the arguments are of type void* * even though they point to integer lists. Note that lower fitnesses * are considered to be better. */ int compareFitness( const void* first, // Items to compare const void* second) // ... { /* * Figure out the fitnesses of both organisms. The funny * "*(Intlist*)" notation first converts the void pointers back * into their true type (IntList*), and then dereferences those * pointers to get the actual IntList. */ double firstFitness = fitness(*(IntList*)first); double secondFitness = fitness(*(IntList*)second); /* * Figure out who goes first. We can't just return the difference * between the two fitnesses, because then we would produce the * wrong result if the difference couldn't be represented by an * integer. */ if (firstFitness < secondFitness) return -1; else if (firstFitness > secondFitness) return 1; else return 0; } /* * Calculate the fitness of an organism. The fitness is defined as * the absolute value of the difference between the organism's square * and the target value that we are trying to find the root of. In * other words, if the organism's value is 5 and the target is 36, the * fitness is 11 (i.e., 36 - 5*5). */ double fitness( const IntList& organism) // Organism to test fitness of { double organismValue = intListToDouble(organism); double square = organismValue * organismValue; return fabs(square - squaredValue); } /* * Find the best organism in a gene pool. "Best" is defined simply as * the organism with the lowest fitness. In case of ties, the * first-found is chosen. */ IntList* findBest( IntList* genePool, // Gene pool to search int poolSize) // Size of the pool { /* * Make sure we don't do an illegal reference */ if (poolSize == 0) return NULL; IntList* best = &genePool[0]; // Best so far double bestFitness = fitness(*best); for (int i = 1; i < poolSize; i++) { if (fitness(genePool[i]) < bestFitness) { best = &genePool[i]; bestFitness = fitness(*best); } } return best; } /* * Perform in-place DNA mutation. We walk through the target string, * looking for the position that is to be mutated. When we reach it, * we replace the existing gene with the new value provided. */ void mutate( IntList& target, // Gene list to be mutated int position, // Position to mutate (0-indexed) int newGene) // Gene to insert at mutation point { // ADD STUFF } /* * Perform DNA crossover. We walk through the two parent strings * simultaneously, copying one or the other to the result. The choice * of what to copy depends on the current position relevant to the * recombination position. Genes from 0 through position-1 are taken * from parent 1; genes from position through the end of the list are * taken from parent 2. Note that the length of the list is * determined implicitly. */ IntList crossover( const IntList& parent1, // First parent's gene list const IntList& parent2, // Second parent's gene list int position) // Where to switch from first to second { IntList result; // Where we'll build the crossover // ADD STUFF return result; } /* * Convert an integer list back into a plain integer. This only works * if the integer list contains values in the range 0-9. It assumes * that the integer list represents a decimal number. * * The return value is a double because it allows us to represent more * digits. An alternative would be a long long, but that's not standard yet. */ double intListToDouble( const IntList& toConvert) // List to be converted { double result = 0; for (IntListIterator i(toConvert); i; i++) result = result * ORGANISM_BASE + *i; return result; }