/* greml.c - see http://www.caressa.it/greml for more information. */

/*
    greml - A simple markup processor to produce html ancient greek texts.
    Copyright 2015 by Paolo Caressa.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <ctype.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define VERSION "1.0"
#define PROGNAME "Greml"
#define AUTHOR "Paolo Caressa "
#define DEBUG 1

/* Standard idiom to compute the size of an array. */
#define SIZE(a) (sizeof(a)/sizeof((a)[0]))

/* Global variables. */

char g_escape = '$';    /* Escape character. */
FILE *g_in = NULL;      /* Current input file. */
FILE *g_out = NULL;     /* Current output file. */
char *g_name = NULL;    /* Name of the current input file. */
int g_line = 0;         /* Line currently under parsing (1-based). */
int g_col = 0;          /* Column of the last character read on the current
                           line (1-based; 0 right after a newline). */

int g_dq_opened = 0;    /* Flag true if a left double quote has been emitted. */

int g_acute = 0;        /* Flag set when '/' is parsed. */
int g_circumflex = 0;   /* Flag set when '^' is parsed. */
int g_diaeresis = 0;    /* Flag set when '=' is parsed. */
int g_grave = 0;        /* Flag set when '\\' is parsed. */
int g_iota = 0;         /* Flag set when '|' is parsed. */
int g_rough = 0;        /* Flag set when '<' is parsed. */
int g_smooth = 0;       /* Flag set when '>' is parsed. */

/* Column value before the last fget_char() call: lets unget_char() restore
   the position exactly, even when the character pushed back is a newline. */
static int g_prev_col = 0;

/* Error function: if cond is non-zero, prints "file:line:col " followed by a
   message (same syntax as printf) on stderr and terminates the program with
   EXIT_FAILURE. If cond is 0 it does nothing. */
void error_on(int cond, const char *fmt, ...)
{
    va_list al;

    if ( !cond ) {
        return;
    }
    /* g_name may still be NULL if the error is raised before any file has
       been selected: never hand a NULL pointer to %s. */
    fprintf(stderr, "%s:%i:%i ", g_name != NULL ? g_name : PROGNAME,
        g_line, g_col);
    va_start(al, fmt);
    vfprintf(stderr, fmt, al);
    va_end(al);
    exit(EXIT_FAILURE);
}

/* Wraps fgetc so to keep the line and column counters up to date. Returns
   the character read from g_in, or EOF (in which case the counters are left
   untouched). */
int fget_char(void)
{
    int c = fgetc(g_in);

    if ( c == EOF ) {
        return c;
    }
    g_prev_col = g_col;
    if ( c == '\n' ) {
        ++ g_line;
        g_col = 0;
    } else {
        ++ g_col;
    }
    return c;
}

/* Pushes back on g_in the character c just returned by fget_char, undoing
   its effect on the line and column counters. Only the last character read
   may be pushed back (same limit as ungetc). Does nothing for EOF. */
static void unget_char(int c)
{
    if ( c == EOF ) {
        return;
    }
    ungetc(c, g_in);
    if ( c == '\n' ) {
        -- g_line;
    }
    g_col = g_prev_col;
}

/* Routines used to implement actions triggered by certain characters. */

void do_acute(void) { g_acute = 1; }
void do_circumflex(void) { g_circumflex = 1; }
void do_diaeresis(void) { g_diaeresis = 1; }
void do_grave(void) { g_grave = 1; }
void do_iota(void) { g_iota = 1; }
void do_rough(void) { g_rough = 1; }
void do_smooth(void) { g_smooth = 1; }

/* The character ch has just been read: if the next one is ch again, the pair
   is replaced by the string doubled, else ch is printed as it is and the
   lookahead character is pushed back. */
static void emit_single_or_double(int ch, const char *doubled)
{
    int next = fget_char();

    if ( next == ch ) {
        fputs(doubled, g_out);
    } else {
        fputc(ch, g_out);
        unget_char(next);
    }
}

/* Either a single '}' or a double '}}' (right angle bracket). */
void do_closed_brace(void) { emit_single_or_double('}', "〉"); }

/* Either a single ']' or a double ']]' (right double bracket). */
void do_closed_bracket(void) { emit_single_or_double(']', "〛"); }

/* Either a single '-' or a double '--' (long hyphen). */
void do_hyphen(void) { emit_single_or_double('-', "–"); }

/* Either a single '{' or a double '{{' (left angle bracket). */
void do_opened_brace(void) { emit_single_or_double('{', "〈"); }

/* Either a single '[' or a double '[[' (left double bracket). */
void do_opened_bracket(void) { emit_single_or_double('[', "〚"); }

/* Emits alternately an opening and a closing double quote. */
void do_dquote(void)
{
    fputs(g_dq_opened ? "”" : "“", g_out);
    g_dq_opened = !g_dq_opened;
}

/* Takes special care about ending sigma: if the next character is not a
   letter nor a diacritic or bracket symbol (which may appear inside words)
   then the final form of the sigma is printed on the out file. */
void do_sigma(void)
{
    int c = fget_char();
    /* The explicit test against '\0' is needed because strchr would match
       the terminator of the set string. */
    int inside_word = c != EOF && c != '\0'
        && (isalpha(c) || strchr("<>\\/^=|[]{}", c) != NULL);

    fputs(inside_word ? "σ" : "ς", g_out);
    unget_char(c);
}

/* Table of "character-actions": keys are characters that, when parsed, cause
   the execution of the corresponding function. */
const struct {
    int c;
    void (*fun)(void);
} CHR_ACTION[] = {
    {'"', do_dquote},
    {'-', do_hyphen},
    {'/', do_acute},
    {'<', do_rough},
    {'=', do_diaeresis},
    {'>', do_smooth},
    {'[', do_opened_bracket},
    {'\\', do_grave},
    {']', do_closed_bracket},
    {'^', do_circumflex},
    {'s', do_sigma},
    {'{', do_opened_brace},
    {'|', do_iota},
    {'}', do_closed_brace},
};

/* Elements of a set of diacritics. */
#define SMOOTH (1)
#define ROUGH (2)
#define ACUTE (4)
#define GRAVE (8)
#define CIRCUMFLEX (16)
#define DIAERESIS (32)
#define IOTA (64)

/* Table of letters with diacritic signs: each entry corresponds to a Unicode
   character, and it is identified via the combination of diacritics
   (obtained by mixing flags) and the corresponding letter. */
const struct {
    int c;
    int diacritic;
    const char *value;
} DIACRITICS[] = {
    {'a', SMOOTH, "ἀ"}, {'a', SMOOTH|GRAVE, "ἂ"},
    {'a', SMOOTH|ACUTE, "ἄ"}, {'a', SMOOTH|CIRCUMFLEX, "ἆ"},
    {'a', ROUGH, "ἁ"}, {'a', ROUGH|GRAVE, "ἃ"},
    {'a', ROUGH|ACUTE, "ἅ"}, {'a', ROUGH|CIRCUMFLEX, "ἇ"},
    {'a', IOTA|GRAVE, "ᾲ"}, {'a', IOTA, "ᾳ"},
    {'a', IOTA|ACUTE, "ᾴ"}, {'a', CIRCUMFLEX, "ᾶ"},
    {'a', IOTA|CIRCUMFLEX, "ᾷ"},
    {'a', IOTA|SMOOTH, "ᾀ"}, {'a', IOTA|SMOOTH|GRAVE, "ᾂ"},
    {'a', IOTA|SMOOTH|ACUTE, "ᾄ"}, {'a', IOTA|SMOOTH|CIRCUMFLEX, "ᾆ"},
    {'a', IOTA|ROUGH, "ᾁ"}, {'a', IOTA|ROUGH|GRAVE, "ᾃ"},
    {'a', IOTA|ROUGH|ACUTE, "ᾅ"}, {'a', IOTA|ROUGH|CIRCUMFLEX, "ᾇ"},

    {'A', SMOOTH, "Ἀ"}, {'A', SMOOTH|GRAVE, "Ἂ"},
    {'A', SMOOTH|ACUTE, "Ἄ"}, {'A', SMOOTH|CIRCUMFLEX, "Ἆ"},
    {'A', ROUGH, "Ἁ"}, {'A', ROUGH|GRAVE, "Ἃ"},
    {'A', ROUGH|ACUTE, "Ἅ"}, {'A', ROUGH|CIRCUMFLEX, "Ἇ"},
    {'A', IOTA|SMOOTH, "ᾈ"}, {'A', IOTA|SMOOTH|GRAVE, "ᾊ"},
    {'A', IOTA|SMOOTH|ACUTE, "ᾌ"}, {'A', IOTA|SMOOTH|CIRCUMFLEX, "ᾎ"},
    {'A', IOTA|ROUGH, "ᾉ"}, {'A', IOTA|ROUGH|GRAVE, "ᾋ"},
    {'A', IOTA|ROUGH|ACUTE, "ᾍ"}, {'A', IOTA|ROUGH|CIRCUMFLEX, "ᾏ"},
    {'A', GRAVE, "Ὰ"}, {'A', ACUTE, "Ά"}, {'A', IOTA, "ᾼ"},

    {'e', SMOOTH, "ἐ"}, {'e', SMOOTH|GRAVE, "ἒ"}, {'e', SMOOTH|ACUTE, "ἔ"},
    {'e', ROUGH, "ἑ"}, {'e', ROUGH|GRAVE, "ἓ"}, {'e', ROUGH|ACUTE, "ἕ"},

    {'E', SMOOTH, "Ἐ"}, {'E', SMOOTH|GRAVE, "Ἒ"}, {'E', SMOOTH|ACUTE, "Ἔ"},
    {'E', ROUGH, "Ἑ"}, {'E', ROUGH|GRAVE, "Ἓ"}, {'E', ROUGH|ACUTE, "Ἕ"},

    {'h', SMOOTH, "ἠ"}, {'h', SMOOTH|GRAVE, "ἢ"},
    {'h', SMOOTH|ACUTE, "ἤ"}, {'h', SMOOTH|CIRCUMFLEX, "ἦ"},
    {'h', ROUGH, "ἡ"}, {'h', ROUGH|GRAVE, "ἣ"},
    {'h', ROUGH|ACUTE, "ἥ"}, {'h', ROUGH|CIRCUMFLEX, "ἧ"},

    {'H', SMOOTH, "Ἠ"}, {'H', SMOOTH|GRAVE, "Ἢ"},
    {'H', SMOOTH|ACUTE, "Ἤ"}, {'H', SMOOTH|CIRCUMFLEX, "Ἦ"},
    {'H', ROUGH, "Ἡ"}, {'H', ROUGH|GRAVE, "Ἣ"},
    {'H', ROUGH|ACUTE, "Ἥ"}, {'H', ROUGH|CIRCUMFLEX, "Ἧ"},

    {'i', SMOOTH, "ἰ"}, {'i', SMOOTH|GRAVE, "ἲ"},
    {'i', SMOOTH|ACUTE, "ἴ"}, {'i', SMOOTH|CIRCUMFLEX, "ἶ"},
    {'i', ROUGH, "ἱ"}, {'i', ROUGH|GRAVE, "ἳ"},
    {'i', ROUGH|ACUTE, "ἵ"}, {'i', ROUGH|CIRCUMFLEX, "ἷ"},

    {'I', SMOOTH, "Ἰ"}, {'I', SMOOTH|GRAVE, "Ἲ"},
    {'I', SMOOTH|ACUTE, "Ἴ"}, {'I', SMOOTH|CIRCUMFLEX, "Ἶ"},
    {'I', ROUGH, "Ἱ"}, {'I', ROUGH|GRAVE, "Ἳ"},
    {'I', ROUGH|ACUTE, "Ἵ"}, {'I', ROUGH|CIRCUMFLEX, "Ἷ"},
    /* Capital iota with dialytika: DIAERESIS is the proper key; the
       CIRCUMFLEX key is kept only for compatibility with existing texts. */
    {'I', DIAERESIS, "Ϊ"}, {'I', CIRCUMFLEX, "Ϊ"},

    {'o', SMOOTH, "ὀ"}, {'o', SMOOTH|GRAVE, "ὂ"}, {'o', SMOOTH|ACUTE, "ὄ"},
    {'o', ROUGH, "ὁ"}, {'o', ROUGH|GRAVE, "ὃ"}, {'o', ROUGH|ACUTE, "ὅ"},

    {'O', SMOOTH, "Ὀ"}, {'O', SMOOTH|GRAVE, "Ὂ"}, {'O', SMOOTH|ACUTE, "Ὄ"},
    {'O', ROUGH, "Ὁ"}, {'O', ROUGH|GRAVE, "Ὃ"}, {'O', ROUGH|ACUTE, "Ὅ"},

    {'u', SMOOTH, "ὐ"}, {'u', SMOOTH|GRAVE, "ὒ"},
    {'u', SMOOTH|ACUTE, "ὔ"}, {'u', SMOOTH|CIRCUMFLEX, "ὖ"},
    {'u', ROUGH, "ὑ"}, {'u', ROUGH|GRAVE, "ὓ"},
    {'u', ROUGH|ACUTE, "ὕ"}, {'u', ROUGH|CIRCUMFLEX, "ὗ"},

    {'U', ROUGH, "Ὑ"}, {'U', ROUGH|GRAVE, "Ὓ"},
    {'U', ROUGH|ACUTE, "Ὕ"}, {'U', ROUGH|CIRCUMFLEX, "Ὗ"},
    /* Capital upsilon with dialytika: DIAERESIS is the proper key; the
       CIRCUMFLEX key is kept only for compatibility with existing texts. */
    {'U', DIAERESIS, "Ϋ"}, {'U', CIRCUMFLEX, "Ϋ"},

    {'w', SMOOTH, "ὠ"}, {'w', SMOOTH|GRAVE, "ὢ"},
    {'w', SMOOTH|ACUTE, "ὤ"}, {'w', SMOOTH|CIRCUMFLEX, "ὦ"},
    {'w', ROUGH, "ὡ"}, {'w', ROUGH|GRAVE, "ὣ"},
    {'w', ROUGH|ACUTE, "ὥ"}, {'w', ROUGH|CIRCUMFLEX, "ὧ"},

    {'W', SMOOTH, "Ὠ"}, {'W', SMOOTH|GRAVE, "Ὢ"},
    {'W', SMOOTH|ACUTE, "Ὤ"}, {'W', SMOOTH|CIRCUMFLEX, "Ὦ"},
    {'W', ROUGH, "Ὡ"}, {'W', ROUGH|GRAVE, "Ὣ"},
    {'W', ROUGH|ACUTE, "Ὥ"}, {'W', ROUGH|CIRCUMFLEX, "Ὧ"},

    {'a', GRAVE, "ὰ"}, {'e', GRAVE, "ὲ"}, {'h', GRAVE, "ὴ"},
    {'i', GRAVE, "ὶ"}, {'o', GRAVE, "ὸ"}, {'u', GRAVE, "ὺ"},
    {'w', GRAVE, "ὼ"},
    {'a', ACUTE, "ά"}, {'e', ACUTE, "έ"}, {'h', ACUTE, "ή"},
    {'i', ACUTE, "ί"}, {'o', ACUTE, "ό"}, {'u', ACUTE, "ύ"},
    {'w', ACUTE, "ώ"},

    {'h', IOTA|SMOOTH, "ᾐ"}, {'h', IOTA|SMOOTH|GRAVE, "ᾒ"},
    {'h', IOTA|SMOOTH|ACUTE, "ᾔ"}, {'h', IOTA|SMOOTH|CIRCUMFLEX, "ᾖ"},
    {'h', IOTA|ROUGH, "ᾑ"}, {'h', IOTA|ROUGH|GRAVE, "ᾓ"},
    {'h', IOTA|ROUGH|ACUTE, "ᾕ"}, {'h', IOTA|ROUGH|CIRCUMFLEX, "ᾗ"},

    {'H', IOTA|SMOOTH, "ᾘ"}, {'H', IOTA|SMOOTH|GRAVE, "ᾚ"},
    {'H', IOTA|SMOOTH|ACUTE, "ᾜ"}, {'H', IOTA|SMOOTH|CIRCUMFLEX, "ᾞ"},
    {'H', IOTA|ROUGH, "ᾙ"}, {'H', IOTA|ROUGH|GRAVE, "ᾛ"},
    {'H', IOTA|ROUGH|ACUTE, "ᾝ"}, {'H', IOTA|ROUGH|CIRCUMFLEX, "ᾟ"},

    {'w', IOTA|SMOOTH, "ᾠ"}, {'w', IOTA|SMOOTH|GRAVE, "ᾢ"},
    {'w', IOTA|SMOOTH|ACUTE, "ᾤ"}, {'w', IOTA|SMOOTH|CIRCUMFLEX, "ᾦ"},
    {'w', IOTA|ROUGH, "ᾡ"}, {'w', IOTA|ROUGH|GRAVE, "ᾣ"},
    {'w', IOTA|ROUGH|ACUTE, "ᾥ"}, {'w', IOTA|ROUGH|CIRCUMFLEX, "ᾧ"},

    {'W', IOTA|SMOOTH, "ᾨ"}, {'W', IOTA|SMOOTH|GRAVE, "ᾪ"},
    {'W', IOTA|SMOOTH|ACUTE, "ᾬ"}, {'W', IOTA|SMOOTH|CIRCUMFLEX, "ᾮ"},
    {'W', IOTA|ROUGH, "ᾩ"}, {'W', IOTA|ROUGH|GRAVE, "ᾫ"},
    {'W', IOTA|ROUGH|ACUTE, "ᾭ"}, {'W', IOTA|ROUGH|CIRCUMFLEX, "ᾯ"},

    {'h', IOTA|GRAVE, "ῂ"}, {'h', IOTA, "ῃ"}, {'h', IOTA|ACUTE, "ῄ"},
    {'h', CIRCUMFLEX, "ῆ"}, {'h', IOTA|CIRCUMFLEX, "ῇ"},
    {'w', IOTA|GRAVE, "ῲ"}, {'w', IOTA, "ῳ"}, {'w', IOTA|ACUTE, "ῴ"},
    {'w', CIRCUMFLEX, "ῶ"}, {'w', IOTA|CIRCUMFLEX, "ῷ"},

    {'H', GRAVE, "Ὴ"}, {'H', ACUTE, "Ή"}, {'H', IOTA, "ῌ"},
    {'W', GRAVE, "Ὼ"}, {'W', ACUTE, "Ώ"}, {'W', IOTA, "ῼ"},
    {'E', GRAVE, "Ὲ"}, {'E', ACUTE, "Έ"},

    /* Small iota with dialytika: the grave and acute forms are U+1FD2 and
       U+1FD3 (not the capital vrachy/macron letters U+1FD8 and U+1FD9). */
    {'i', DIAERESIS, "ϊ"}, {'i', GRAVE|DIAERESIS, "ῒ"},
    {'i', ACUTE|DIAERESIS, "ΐ"}, {'i', CIRCUMFLEX, "ῖ"},
    {'i', CIRCUMFLEX|DIAERESIS, "ῗ"},
    {'I', GRAVE, "Ὶ"}, {'I', ACUTE, "Ί"},

    /* Small upsilon with dialytika: the grave and acute forms are U+1FE2 and
       U+1FE3 (not the capital vrachy/macron letters U+1FE8 and U+1FE9). */
    {'u', DIAERESIS, "ϋ"}, {'u', GRAVE|DIAERESIS, "ῢ"},
    {'u', ACUTE|DIAERESIS, "ΰ"}, {'u', CIRCUMFLEX, "ῦ"},
    {'u', CIRCUMFLEX|DIAERESIS, "ῧ"},
    {'U', GRAVE, "Ὺ"}, {'U', ACUTE, "Ύ"},

    {'O', GRAVE, "Ὸ"}, {'O', ACUTE, "Ό"},

    {'r', SMOOTH, "ῤ"}, {'r', ROUGH, "ῥ"}, {'R', ROUGH, "Ῥ"},
};

/* Returns the address of a static string which describes the letter c
   together with the currently activated diacritic signs. The string is
   overwritten by the next call. */
char *print_diacritics(int c)
{
    static char buffer[BUFSIZ];

    snprintf(buffer, sizeof buffer, "%c with %s%s%s%s%s%s%s", c,
        g_rough ? "rough " : "",
        g_smooth ? "smooth " : "",
        g_acute ? "acute " : "",
        g_grave ? "grave " : "",
        g_circumflex ? "circumflex " : "",
        g_diaeresis ? "diaeresis " : "",
        g_iota ? "iota subscript" : "");
    return buffer;
}

/* Table of "character-string": this is a 96 long array of strings (running
   from ASCII code 0x20 (element 0) to ASCII code 0x7F (element 95)), where
   at the element c - 0x20 is stored the string to substitute for the
   character of ASCII encoding c, or NULL if no substitution has to be
   done. */
const char *CHR_TRANS[96] = {
    /* 0x20  sp ! " # $ % & ' */
    NULL, NULL, NULL, NULL, NULL, NULL, "καὶ", "᾿",
    /* 0x28  ( ) * + , - . / */
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    /* 0x30  0 1 2 3 4 5 6 7 */
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    /* 0x38  8 9 : ; < = > ? */
    NULL, NULL, NULL, "·", NULL, NULL, NULL, NULL,
    /* 0x40  @ A B C D E F G */
    NULL, "Α", "Β", "Χ", "Δ", "Ε", "Φ", "Γ",
    /* 0x48  H I J K L M N O */
    "Η", "Ι", "Ϲ", "Κ", "Λ", "Μ", "Ν", "Ο",
    /* 0x50  P Q R S T U V W */
    "Π", "Θ", "Ρ", "Σ", "Τ", "Υ", "Ϝ", "Ω",
    /* 0x58  X Y Z [ \ ] ^ _ */
    "Ξ", "Ψ", "Ζ", NULL, NULL, NULL, NULL, NULL,
    /* 0x60  ` a b c d e f g */
    "῾", "α", "β", "χ", "δ", "ε", "φ", "γ",
    /* 0x68  h i j k l m n o */
    "η", "ι", "ς", "κ", "λ", "μ", "ν", "ο",
    /* 0x70  p q r s t u v w  ('s' is handled by do_sigma) */
    "π", "θ", "ρ", NULL, "τ", "υ", "ϝ", "ω",
    /* 0x78  x y z { | } ~ DEL */
    "ξ", "ψ", "ζ", NULL, NULL, NULL, NULL, NULL,
};

/* Clears all the pending diacritic flags. */
static void reset_diacritics(void)
{
    g_rough = g_smooth = g_acute = g_grave = g_circumflex = g_diaeresis
        = g_iota = 0;
}

/* Returns the set of diacritic flags (SMOOTH, ROUGH, ...) corresponding to
   the g_* diacritic variables currently set. */
static int pending_diacritics(void)
{
    return (g_rough ? ROUGH : 0)
        | (g_smooth ? SMOOTH : 0)
        | (g_acute ? ACUTE : 0)
        | (g_grave ? GRAVE : 0)
        | (g_circumflex ? CIRCUMFLEX : 0)
        | (g_diaeresis ? DIAERESIS : 0)
        | (g_iota ? IOTA : 0);
}

/* Looks for the letter c with the set of diacritics flags inside the table
   DIACRITICS: returns the corresponding string, or NULL if there is none. */
static const char *find_diacritic(int c, int flags)
{
    size_t i;

    for ( i = 0; i < SIZE(DIACRITICS); ++ i ) {
        if ( DIACRITICS[i].c == c && DIACRITICS[i].diacritic == flags ) {
            return DIACRITICS[i].value;
        }
    }
    return NULL;
}

/* Emits a character which is not a letter with pending diacritics: it may
   be an "alias character" (see CHR_TRANS), an "executable character" (see
   CHR_ACTION) or a bare character to print as it is. */
static void emit_plain(int c)
{
    size_t i;

    if ( c >= 0x20 && c < 0x80 && CHR_TRANS[c - 0x20] != NULL ) {
        fputs(CHR_TRANS[c - 0x20], g_out);
        return;
    }
    for ( i = 0; i < SIZE(CHR_ACTION); ++ i ) {
        if ( CHR_ACTION[i].c == c ) {
            CHR_ACTION[i].fun();
            return;
        }
    }
    fputc(c, g_out);
}

/* Translates the content of a greek region, until the end of file or the
   next escape character is found. */
void greml_transliterate(void)
{
    int c;

    reset_diacritics();

    while ( (c = fget_char()) != EOF && c != g_escape ) {
        int breathing = g_rough + g_smooth;
        int accent = g_acute + g_circumflex + g_grave;
        int other = g_diaeresis + g_iota;

        /* Check against diacritics coherence. */
        error_on(breathing > 1, "Cannot mix different breathings: %s",
            print_diacritics(c));
        error_on(accent > 1, "Cannot mix different accents: %s",
            print_diacritics(c));
        error_on(breathing + accent + other > 3,
            "More than three diacritics together: %s", print_diacritics(c));

        if ( breathing + accent + other > 0 && isalpha(c) ) {
            /* A letter preceded by diacritics: it is replaced by the
               corresponding precomposed character. */
            const char *glyph = find_diacritic(c, pending_diacritics());
            error_on(glyph == NULL, "Invalid diacritic combination: %s",
                print_diacritics(c));
            fputs(glyph, g_out);
            reset_diacritics();
        } else {
            /* Anything else; notice that a diacritic character parsed here
               just sets its flag, which is kept until a letter comes. */
            emit_plain(c);
        }
    }
}

/* Scans the inname file and dumps on the outname file, by transliterating to
   Greek inside "greek regions", delimited by g_escape. Any failure in
   opening or writing the files terminates the program via error_on. */
void greml(char *inname, char *outname)
{
    int c;

    /* Set before any check, since error_on prints it. */
    g_name = inname;
    g_line = 0;
    g_col = 0;
    g_dq_opened = 0;    /* Quotes do not carry over from a previous file. */

    /* Opening the output file would truncate the input one. */
    error_on(strcmp(inname, outname) == 0,
        "Output file %s would overwrite the input file", outname);

    /* The output file is created only once the input file is available. */
    g_in = fopen(inname, "r");
    error_on(g_in == NULL, "Cannot open file %s for reading", inname);
    g_out = fopen(outname, "w");
    error_on(g_out == NULL, "Cannot open file %s for writing", outname);

    /* Line and column counters are maintained by fget_char/unget_char. */
    g_line = 1;
    g_col = 0;

    while ( (c = fget_char()) != EOF ) {
        if ( c != g_escape ) {
            fputc(c, g_out);
            continue;
        }
        c = fget_char();
        if ( c == g_escape ) {
            /* A repeated escape character dumps itself on the output
               file. */
            fputc(g_escape, g_out);
        } else {
            unget_char(c);
            greml_transliterate();
        }
    }

    /* fclose flushes the buffered output: a failure means data loss. */
    error_on(fclose(g_out) != 0, "Cannot write file %s", outname);
    g_out = NULL;
    fclose(g_in);
    g_in = NULL;
}

/* Exit handler (see atexit in main): waits for a line on the standard input
   before the program terminates. */
void finally(void)
{
    int c;

    /* NOTE(review): the prompt looks like it lost the name of a key
       (something like "Press <ENTER> to terminate...") - confirm. */
    fprintf(stderr, "\nPress to terminate...");
    do {
        c = getchar();
    } while ( c != '\n' && c != EOF );
}

const char USAGE[] =
    "USAGE: greml [options] file1 ... filen\n"
    "Options:\n"
    " -e c Sets the escape character to c\n";

/* Stores into dst (size bytes long) the name of the output file for the
   input file src: the extension of the last path component, if any, is
   replaced by ".html". Returns 1 on success, 0 if the name does not fit. */
static int make_output_name(char *dst, size_t size, const char *src)
{
    static const char ext[] = ".html";
    const char *base = src;
    const char *dot;
    const char *p;
    size_t stem;

    /* Only a dot inside the last path component starts an extension. */
    for ( p = src; *p != '\0'; ++ p ) {
        if ( *p == '/' || *p == '\\' ) {
            base = p + 1;
        }
    }
    dot = strrchr(base, '.');
    if ( dot == base ) {
        /* A leading dot (as in ".profile") is part of the name. */
        dot = NULL;
    }
    stem = dot != NULL ? (size_t)(dot - src) : strlen(src);

    if ( stem + sizeof ext > size ) {
        return 0;
    }
    memcpy(dst, src, stem);
    memcpy(dst + stem, ext, sizeof ext);    /* Copies the terminator too. */
    return 1;
}

/* Processes each file given on the command line, writing the result on a
   file with the same name and extension ".html"; the option "-e c" changes
   the escape character for the files which follow it. */
int main(int argc, char **argv)
{
    int i;
    char out_name[FILENAME_MAX];

    atexit(finally);

    for ( i = 1; i < argc; ++ i ) {
        const char *arg = argv[i];

        if ( arg[0] == '-' && arg[1] != '\0' ) {
            /* Option "-e character": needs a non empty argument. */
            if ( arg[1] == 'e' && i < argc - 1 && argv[i + 1][0] != '\0' ) {
                ++ i;
                g_escape = argv[i][0];
                continue;
            }
            /* Unknown option or missing argument. */
            fputs(USAGE, stderr);
            exit(EXIT_FAILURE);
        }

        if ( !make_output_name(out_name, sizeof out_name, arg) ) {
            fprintf(stderr, "%s: file name too long\n", arg);
            exit(EXIT_FAILURE);
        }
        greml(argv[i], out_name);
    }
    return EXIT_SUCCESS;
}