/* * Program: pgn-extract: a Portable Game Notation (PGN) extractor. * Copyright (C) 1994-2001 David Barnes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 1, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * David Barnes may be contacted as D.J.Barnes@ukc.ac.uk * http://www.cs.ukc.ac.uk/people/staff/djb/ * */ #include #include #include #include #include "bool.h" #include "defs.h" #include "typedef.h" #include "lines.h" #include "taglist.h" #include "tokens.h" #include "lex.h" #include "moves.h" #include "end.h" #include "eco.h" #include "argsfile.h" #include "apply.h" #include "output.h" #include "lists.h" #define CURRENT_VERSION "(v15.0)" #define URL "http://www.cs.ukc.ac.uk/people/staff/djb/pgn-extract/" /* The prefix of the arguments allowed in an argsfile. * The full format is: * :-? * where ? is an argument letter. * * A line of the form: * :filename * means use filename as a NORMALFILE source of games. * * A line with no leading colon character is taken to apply to the * move-reason argument line. Currently, this only applies to the * -t -v -x -z * arguments. */ static const char argument_prefix[] = ":-"; static const int argument_prefix_len = sizeof(argument_prefix)-1; static ArgType classify_arg(const char *line); /* Select the correct function according to operating system. */ static int stringcompare(const char *s1, const char *s2) { #if defined(__unix__) || defined(__linux__) return strcasecmp(s1,s2); #else return stricmp(s1,s2); #endif } /* Print a usage message, and exit. */ static void usage_and_exit(unsigned level) { const char **help_data; const char *level_0_help[] = { "-aoutputfile -- append extracted games to outputfile. (See -o).", "-Aargsfile -- read the program's arguments from argsfile.", "-dduplicates -- write duplicate games to the file duplicates.", "-D -- don't output duplicate games.", "-ffile_list -- file_list contains the list of PGN source files, one per line.", "-h1 -- print details of further arguments.", "-llogfile -- Save the diagnostics in logfile rather than using stderr.", "-ooutputfile -- write extracted games to outputfile (existing contents lost).", "-r -- report any errors but don't extract.", "-s -- silent mode: don't report each game as it is extracted.", "-ttagfile -- file of tag extraction criteria.", "-Tcriterion -- player, date, or result extraction criterion.", "-U -- don't output games that only occur once. (See -d).", "-vvariations -- the file variations contains the textual lines of interest.", "-xvariations -- the file variations contains the lines resulting in", " positions of interest.", /* Must be NULL terminated. */ (char *)NULL, }; const char *level_1_help[] = { "-7 -- output only the seven tag roster for each game. Other tags (apart", " from FEN and possibly ECO) are discarded (See -e).", "-b[elu]num -- restricted bounds on the number of moves in a game.", " lnum set a lower bound of `num' moves,", " unum set an upper bound of `num' moves,", " otherwise num (or enum) means equal-to `num' moves.", "-cfile[.pgn] -- Use file.pgn as a check-file for duplicates or", " contents of file (no pgn suffix) as a list of check-file names.", "-C -- don't include comments in the output. Ordinarily these are retained.", "-eECO_file -- perform ECO classification of games. The optional", " ECO_file should contain a PGN format list of ECO lines", " Default is to use eco.pgn from the current directory.", "-E[123 etc.] -- split output into separate files according to ECO.", " E1 : Produce files from ECO letter, A.pgn, B.pgn, ...", " E2 : Produce files from ECO letter and first digit, A0.pgn, ...", " E3 : Produce files from full ECO code, A00.pgn, A01.pgn, ...", " Further digits may be used to produce non-standard further", " refined division of games.", " All files are opened in append mode.", "-F -- output a FEN string comment of the final game position.", "-h -- print details of the main arguments.", "-Llogfile -- Append all diagnostics to logfile, rather than overwriting.", "-noutputfile -- Write all valid games not otherwise output to outputfile.", "-N -- don't include NAGs in the output. Ordinarily these are retained.", "-P -- don't match permutations of the textual variations (-v).", "-Rtagorder -- Use the tag ordering specified in the file tagorder.", "-S -- Use a simple soundex algorithm for some tag matches. If used", " this option must precede the -t or -T options.", "-V -- don't include variations in the output. Ordinarily these are retained.", "-wwidth -- set width as an approximate line width for output.", "-W[cm|epd|halg|lalg|san] -- specify the output format to use.", " Default is SAN.", " -W means use the input format.", " -Wcm is (a possibly obsolete) ChessMaster format.", " -Wepd is EPD format.", " -Wsan[PNBRQK] for language specific output.", " -Whalg is hyphenated long algebraic.", " -Wlalg is long algebraic.", "-zendings -- the file endings contains the end positions of interest.", "-Z -- use the file virtual.tmp as an external hash table for duplicates.", " Use when MallocOrDie messages occur with big datasets.", "-#num -- output num games per file, to files named 1.pgn, 2.pgn, etc.", /* Must be NULL terminated. */ (char *)NULL, }; /* Select the correct set of help information. */ if(level == 0){ help_data = level_0_help; } else{ help_data = level_1_help; } fprintf(GlobalState.logfile, "pgn-extract %s (%s): a Portable Game Notation (PGN) manipulator.\n", CURRENT_VERSION,__DATE__); fprintf(GlobalState.logfile, "Copyright (C) 1994-2001 David J. Barnes (d.j.barnes@ukc.ac.uk)\n"); fprintf(GlobalState.logfile,"%s\n\n",URL); fprintf(GlobalState.logfile,"Usage: pgn-extract [arguments] [file.pgn ...]\n"); fprintf(GlobalState.logfile,"Partial list of arguments "); fprintf(GlobalState.logfile,"(see -%c for more):\n",HELP_ARGUMENT); for(; *help_data != NULL; help_data++){ fprintf(GlobalState.logfile,"%s\n",*help_data); } exit(1); } void read_args_file(const char *infile) { char *line; FILE *fp = fopen(infile,"r"); if(fp == NULL){ fprintf(GlobalState.logfile,"Cannot open %s for reading.\n",infile); } else{ ArgType linetype = NO_ARGUMENT_MATCH; ArgType nexttype; while((line = read_line(fp)) != NULL){ if(blank_line(line)){ (void) free(line); continue; } nexttype = classify_arg(line); if(nexttype == NO_ARGUMENT_MATCH){ if(*line == argument_prefix[0]){ /* Treat the line as a source file name. */ add_filename_to_source_list(&line[1],NORMALFILE); } else if(linetype != NO_ARGUMENT_MATCH){ /* Handle the line. */ switch(linetype){ case MOVES_ARGUMENT: add_textual_variation_from_line(line); break; case POSITIONS_ARGUMENT: add_positional_variation_from_line(line); break; case TAGS_ARGUMENT: process_tag_line(infile,line); break; case TAG_ROSTER_ARGUMENT: process_roster_line(line); break; case ENDINGS_ARGUMENT: process_ending_line(line); (void) free(line); break; default: fprintf(GlobalState.logfile, "Internal error: unknown linetype %d in read_args_file\n", linetype); (void) free(line); exit(-1); } } else{ /* It should have been a line applying to the * current linetype. */ fprintf(GlobalState.logfile, "Missing argument type for line %s in the argument file.\n", line); exit(1); } } else{ switch(nexttype){ /* Arguments with a possible additional * argument value. * All of these apply only to the current * line in the argument file. */ case WRITE_TO_OUTPUT_FILE_ARGUMENT: case APPEND_TO_OUTPUT_FILE_ARGUMENT: case WRITE_TO_LOG_FILE_ARGUMENT: case APPEND_TO_LOG_FILE_ARGUMENT: case DUPLICATES_FILE_ARGUMENT: case USE_ECO_FILE_ARGUMENT: case CHECK_FILE_ARGUMENT: case FILE_OF_FILES_ARGUMENT: case BOUNDS_ARGUMENT: case GAMES_PER_FILE_ARGUMENT: case ECO_OUTPUT_LEVEL_ARGUMENT: case FILE_OF_ARGUMENTS_ARGUMENT: case NON_MATCHING_GAMES_ARGUMENT: case TAG_EXTRACTION_ARGUMENT: case LINE_WIDTH_ARGUMENT: case OUTPUT_FORMAT_ARGUMENT: case HELP_ARGUMENT: process_argument(line[argument_prefix_len], &line[argument_prefix_len+1]); linetype = NO_ARGUMENT_MATCH; break; /* Arguments with no additional * argument value. * All of these apply only to the current * line in the argument file. */ case SEVEN_TAG_ROSTER_ARGUMENT: case ALTERNATIVE_HELP_ARGUMENT: case DONT_KEEP_COMMENTS_ARGUMENT: case DONT_KEEP_DUPLICATES_ARGUMENT: case DONT_MATCH_PERMUTATIONS_ARGUMENT: case DONT_KEEP_NAGS_ARGUMENT: case OUTPUT_FEN_STRING_ARGUMENT: case CHECK_ONLY_ARGUMENT: case KEEP_SILENT_ARGUMENT: case USE_SOUNDEX_ARGUMENT: case SUPPRESS_ORIGINALS_ARGUMENT: case DONT_KEEP_VARIATIONS_ARGUMENT: case USE_VIRTUAL_HASH_TABLE_ARGUMENT: process_argument(line[argument_prefix_len],""); linetype = NO_ARGUMENT_MATCH; break; /* Arguments whose values persist beyond * the current line. */ case MOVES_ARGUMENT: case POSITIONS_ARGUMENT: case ENDINGS_ARGUMENT: case TAGS_ARGUMENT: case TAG_ROSTER_ARGUMENT: process_argument(line[argument_prefix_len], &line[argument_prefix_len+1]); /* Apply this type to subsequent lines. */ linetype = nexttype; break; default: linetype = nexttype; break; } (void) free(line); } } (void) fclose(fp); } } /* Determine which (if any) type of argument is * indicated by the contents of the current line. * Arguments are assumed to be surrounded by * square brackets. */ static ArgType classify_arg(const char *line) { /* Valid arguments must have at least one character beyond * the prefix. */ static const int min_argument_length = 1+sizeof(argument_prefix)-1; int line_length = strlen(line); /* Check for a line of the form: * :-argument */ if((strncmp(line,argument_prefix,argument_prefix_len) == 0) && (line_length >= min_argument_length)){ char argument_letter = line[argument_prefix_len]; switch(argument_letter){ case TAGS_ARGUMENT: case MOVES_ARGUMENT: case POSITIONS_ARGUMENT: case ENDINGS_ARGUMENT: case TAG_EXTRACTION_ARGUMENT: case LINE_WIDTH_ARGUMENT: case OUTPUT_FORMAT_ARGUMENT: case SEVEN_TAG_ROSTER_ARGUMENT: case FILE_OF_ARGUMENTS_ARGUMENT: case NON_MATCHING_GAMES_ARGUMENT: case DONT_KEEP_COMMENTS_ARGUMENT: case DONT_KEEP_DUPLICATES_ARGUMENT: case DONT_KEEP_NAGS_ARGUMENT: case DONT_MATCH_PERMUTATIONS_ARGUMENT: case OUTPUT_FEN_STRING_ARGUMENT: case CHECK_ONLY_ARGUMENT: case KEEP_SILENT_ARGUMENT: case USE_SOUNDEX_ARGUMENT: case SUPPRESS_ORIGINALS_ARGUMENT: case DONT_KEEP_VARIATIONS_ARGUMENT: case WRITE_TO_OUTPUT_FILE_ARGUMENT: case WRITE_TO_LOG_FILE_ARGUMENT: case APPEND_TO_LOG_FILE_ARGUMENT: case APPEND_TO_OUTPUT_FILE_ARGUMENT: case DUPLICATES_FILE_ARGUMENT: case USE_ECO_FILE_ARGUMENT: case CHECK_FILE_ARGUMENT: case FILE_OF_FILES_ARGUMENT: case BOUNDS_ARGUMENT: case GAMES_PER_FILE_ARGUMENT: case ECO_OUTPUT_LEVEL_ARGUMENT: case HELP_ARGUMENT: case ALTERNATIVE_HELP_ARGUMENT: case TAG_ROSTER_ARGUMENT: return (ArgType) argument_letter; default: fprintf(GlobalState.logfile, "Unrecognized argument: %s in the argument file.\n", line); exit(1); return NO_ARGUMENT_MATCH; } } else{ return NO_ARGUMENT_MATCH; } } /* Process the argument letter and its associated value. * This function processes arguments from the command line and * from an argument file associated with the -A argument. * * An argument -ofile.pgn would be passed in as: * 'o' and "file.pgn". * A zero-length string for associated_value is not necessarily * an error, e.g. -e has an optional following filenname. * If the associated_value is to be used beyond this function, * it must be copied. */ void process_argument(char arg_letter,const char *associated_value) { /* Provide an alias for associated_value because it will * often represent a file name. */ const char *filename = associated_value; switch(arg_letter){ case WRITE_TO_OUTPUT_FILE_ARGUMENT: case APPEND_TO_OUTPUT_FILE_ARGUMENT: if(GlobalState.ECO_level > 0){ fprintf(GlobalState.logfile,"-%c conflicts with -E\n", arg_letter); } else if(GlobalState.games_per_file > 0){ fprintf(GlobalState.logfile,"-%c conflicts with -#\n", arg_letter); } else if(GlobalState.output_filename != NULL){ fprintf(GlobalState.logfile, "-%c: File %s has already been selected for output.\n", arg_letter,GlobalState.output_filename); exit(1); } else if(*filename == '\0'){ fprintf(GlobalState.logfile,"Usage: -%cfilename.\n",arg_letter); exit(1); } else{ if(GlobalState.outputfile != NULL){ (void) fclose(GlobalState.outputfile); } if(arg_letter == WRITE_TO_OUTPUT_FILE_ARGUMENT){ GlobalState.outputfile = MustOpen(filename,"w"); } else{ GlobalState.outputfile = MustOpen(filename,"a"); } GlobalState.output_filename = filename; } break; case WRITE_TO_LOG_FILE_ARGUMENT: case APPEND_TO_LOG_FILE_ARGUMENT: /* Take precautions against multiple log files. */ if((GlobalState.logfile != stderr) && (GlobalState.logfile != NULL)){ (void) fclose(GlobalState.logfile); } if(arg_letter == WRITE_TO_LOG_FILE_ARGUMENT){ GlobalState.logfile = fopen(filename,"w"); } else{ GlobalState.logfile = fopen(filename,"a"); } if(GlobalState.logfile == NULL){ fprintf(stderr,"Unable to open %s for writing.\n",filename); GlobalState.logfile = stderr; } break; case DUPLICATES_FILE_ARGUMENT: if(*filename == '\0'){ fprintf(GlobalState.logfile,"Usage: -%cfilename.\n",arg_letter); exit(1); } else if(GlobalState.suppress_duplicates){ fprintf(GlobalState.logfile, "-%c clashes with the -%c flag.\n",arg_letter, DONT_KEEP_DUPLICATES_ARGUMENT); exit(1); } else{ GlobalState.duplicate_file = MustOpen(filename,"w"); } break; case USE_ECO_FILE_ARGUMENT: GlobalState.add_ECO = TRUE; if(*filename != '\0'){ GlobalState.eco_file = StringCopy(filename); } else if((filename = getenv("ECO_FILE")) != NULL){ GlobalState.eco_file = filename; } else{ /* Use the default which is already set up. */ } initEcoTable(); break; case ECO_OUTPUT_LEVEL_ARGUMENT: { unsigned level; if(GlobalState.output_filename != NULL){ fprintf(GlobalState.logfile, "-%c: File %s has already been selected for output.\n", arg_letter, GlobalState.output_filename); exit(1); } else if(GlobalState.games_per_file > 0){ fprintf(GlobalState.logfile, "-%c conflicts with -#.\n", arg_letter); exit(1); } else if(sscanf(associated_value,"%u",&level) != 1){ fprintf(GlobalState.logfile, "-%c requires a number attached, e.g., -%c1.\n", arg_letter,arg_letter); exit(1); } else if((level < MIN_ECO_LEVEL) || (level > MAX_ECO_LEVEL)){ fprintf(GlobalState.logfile, "-%c level should be between %u and %u.\n", MIN_ECO_LEVEL,MAX_ECO_LEVEL,arg_letter); exit(1); } else{ GlobalState.ECO_level = level; } } break; case CHECK_FILE_ARGUMENT: if(*filename != '\0'){ /* See if it is a single PGN file, or a list * of files. */ unsigned len = strlen(filename); /* Check for a .PGN suffix. */ const char *suffix = output_file_suffix(SAN); if((len > strlen(suffix)) && (stringcompare(&filename[len-strlen(suffix)], suffix) == 0)){ add_filename_to_source_list(filename,CHECKFILE); } else{ FILE *fp = MustOpen(filename,"r"); add_filename_list_from_file(fp,CHECKFILE); (void) fclose(fp); } } break; case FILE_OF_FILES_ARGUMENT: if(*filename != '\0'){ FILE *fp = MustOpen(filename,"r"); add_filename_list_from_file(fp,NORMALFILE); (void) fclose(fp); } else{ fprintf(GlobalState.logfile,"Filename expected with -%c\n", arg_letter); } break; case BOUNDS_ARGUMENT: { /* Bounds on the number of moves are to be found. * "l#" means less-than-or-equal-to. * "g#" means greater-than-or-equal-to. * Otherwise "#" (or "e#") means that number. */ /* Equal by default. */ char which = 'e'; unsigned value; Boolean Ok = TRUE; const char *bound = associated_value; switch(*bound){ case 'l': case 'u': case 'e': which = *bound; bound++; break; default: if(!isdigit((int) *bound)){ fprintf(GlobalState.logfile, "-b must be followed by e, l, or u.\n"); Ok = FALSE; } break; } if(Ok && (sscanf(bound,"%u",&value) == 1)){ GlobalState.check_move_bounds = TRUE; switch(which){ case 'e': GlobalState.lower_move_bound = value; GlobalState.upper_move_bound = value; break; case 'l': if(value <= GlobalState.upper_move_bound){ GlobalState.lower_move_bound = value; } else{ fprintf(GlobalState.logfile, "Lower bound is greater than the upper bound; -%c ignored.\n", arg_letter); Ok = FALSE; } break; case 'u': if(value >= GlobalState.lower_move_bound){ GlobalState.upper_move_bound = value; } else{ fprintf(GlobalState.logfile, "Upper bound is smaller than the lower bound; -%c ignored.\n", arg_letter); Ok = FALSE; } break; } } else{ fprintf(GlobalState.logfile, "-%c should be in the form -%c[elu]number.\n", arg_letter,arg_letter); Ok = FALSE; } if(!Ok){ exit(1); } } break; case GAMES_PER_FILE_ARGUMENT: if(GlobalState.ECO_level > 0){ fprintf(GlobalState.logfile, "-%c conflicts with -E.\n",arg_letter); exit(1); } else if(GlobalState.output_filename != NULL){ fprintf(GlobalState.logfile, "-%c: File %s has already been selected for output.\n", arg_letter, GlobalState.output_filename); exit(1); } else if(sscanf(associated_value,"%u", &GlobalState.games_per_file) != 1){ fprintf(GlobalState.logfile, "-%c should be followed by an unsigned integer.\n", arg_letter); exit(1); } else{ /* Value set. */ } break; case FILE_OF_ARGUMENTS_ARGUMENT: if(*filename != '\0'){ /* @@@ Potentially recursive call. Is this safe? */ read_args_file(filename); } else{ fprintf(GlobalState.logfile,"Usage: -%cfilename.\n", arg_letter); } break; case NON_MATCHING_GAMES_ARGUMENT: if(*filename != '\0'){ if(GlobalState.non_matching_file != NULL){ (void) fclose(GlobalState.non_matching_file); } GlobalState.non_matching_file = MustOpen(filename,"w"); } else{ fprintf(GlobalState.logfile,"Usage: -%cfilename.\n",arg_letter); exit(1); } break; case TAG_EXTRACTION_ARGUMENT: /* A single tag extraction criterion. */ extract_tag_argument(associated_value); break; case LINE_WIDTH_ARGUMENT: { /* Specify an output line width. */ unsigned length; if(sscanf(associated_value,"%u",&length) > 0){ GlobalState.max_line_length = length; } else{ fprintf(GlobalState.logfile, "-%c should be followed by an unsigned integer.\n", arg_letter); exit(1); } } break; case HELP_ARGUMENT: { /* Determine at which level help is required. */ unsigned level = 0; (void) sscanf(associated_value,"%u",&level); usage_and_exit(level); } break; case OUTPUT_FORMAT_ARGUMENT: /* Whether to use the source form of moves or * rewrite them into another format. */ GlobalState.output_format = which_output_format(associated_value); break; case SEVEN_TAG_ROSTER_ARGUMENT: GlobalState.seven_tag_roster = TRUE; break; case DONT_KEEP_COMMENTS_ARGUMENT: GlobalState.keep_comments = FALSE; break; case DONT_KEEP_DUPLICATES_ARGUMENT: /* Make sure that this doesn't clash with -d. */ if(GlobalState.duplicate_file == NULL){ GlobalState.suppress_duplicates = TRUE; } else{ fprintf(GlobalState.logfile, "-%c clashes with -%c flag.\n", DONT_KEEP_DUPLICATES_ARGUMENT, DUPLICATES_FILE_ARGUMENT); exit(1); } break; case DONT_MATCH_PERMUTATIONS_ARGUMENT: GlobalState.match_permutations = FALSE; break; case DONT_KEEP_NAGS_ARGUMENT: GlobalState.keep_NAGs = FALSE; break; case OUTPUT_FEN_STRING_ARGUMENT: /* Output a FEN string of the final position. * This is displayed in a comment. */ GlobalState.output_FEN_string = TRUE; break; case CHECK_ONLY_ARGUMENT: /* Report errors, but don't convert. */ GlobalState.check_only = TRUE; break; case KEEP_SILENT_ARGUMENT: /* Turn off progress reporting. */ GlobalState.verbose = FALSE; break; case USE_SOUNDEX_ARGUMENT: /* Use soundex matches for player tags. */ GlobalState.use_soundex = TRUE; break; case SUPPRESS_ORIGINALS_ARGUMENT: GlobalState.suppress_originals = TRUE; break; case DONT_KEEP_VARIATIONS_ARGUMENT: GlobalState.keep_variations = FALSE; break; case USE_VIRTUAL_HASH_TABLE_ARGUMENT: GlobalState.use_virtual_hash_table = TRUE; break; case TAGS_ARGUMENT: if(*filename != '\0'){ read_tag_file(filename); } break; case TAG_ROSTER_ARGUMENT: if(*filename != '\0'){ read_tag_roster_file(filename); } break; case MOVES_ARGUMENT: if(*filename != '\0'){ /* Where the list of variations of interest are kept. */ FILE *variation_file = MustOpen(filename,"r"); /* We wish to search for particular variations. */ add_textual_variations_from_file(variation_file); fclose(variation_file); } break; case POSITIONS_ARGUMENT: if(*filename != '\0'){ FILE *variation_file = MustOpen(filename,"r"); /* We wish to search for positional variations. */ add_positional_variations_from_file(variation_file); fclose(variation_file); } break; case ENDINGS_ARGUMENT: if(*filename != '\0'){ if(!build_endings(filename)){ exit(1); } } break; } }