#!/usr/bin/env python3
# encoding: utf-8

"""Runs a glossary check on all po files and writes the check results to
po_validation/glossary.

You will need to have the Translate Toolkit installed in order for the checks to work:
http://toolkit.translatehouse.org/

This script also uses hunspell to reduce the number of false positive hits, so
install as many of the needed hunspell dictionaries as you can find; it will
inform you about missing hunspell locales.

For Debian-based Linux: sudo apt-get install translate-toolkit hunspell hunspell-ar hunspell-bg hunspell-br hunspell-ca hunspell-cs hunspell-da hunspell-de-de hunspell-el hunspell-en-ca hunspell-en-gb hunspell-en-us hunspell-eu hunspell-fr hunspell-gd hunspell-gl hunspell-he hunspell-hr hunspell-hu hunspell-it hunspell-ko hunspell-lt hunspell-nl hunspell-no hunspell-pl hunspell-pt-br hunspell-pt-pt hunspell-ro hunspell-ru hunspell-si hunspell-sk hunspell-sl hunspell-sr hunspell-sv hunspell-uk hunspell-vi

You will need to provide an export of the Transifex glossary and specify it at
the command line. Make sure to select "Include glossary notes in file" when
exporting the csv from Transifex.

Translators can 'misuse' their language's comment field on Transifex to add
inflected forms of their glossary translations. We use the delimiter '|' to
signal that the field contains inflected forms. Examples:

Source    Translation    Comment             Translation will be matched against
------    -----------    ----------------    -----------------------------------
sheep     sheep          Nice, fluffy!       'sheep'
ax        axe            axes|               'axe', 'axes'
click     click          clicking|clicked    'click', 'clicking', 'clicked'
click     click          clicking | clicked  'click', 'clicking', 'clicked'

"""

import os.path
import re
import subprocess
import sys
import time
import traceback
from file_utils import read_csv_file, make_path, delete_path

#############################################################################
# Data Containers                                                           #
#############################################################################


class GlossaryEntry:
    """An entry in our parsed glossaries."""

    def __init__(self):
        # Base form of the term, followed by any inflected forms
        self.terms = []
        # Base form of the translation, followed by any inflected forms
        self.translations = []


class FailedTranslation:
    """Information about a translation that failed a check."""

    def __init__(self):
        # The locale where the check failed
        self.locale = ''
        # The po file containing the failed translation
        self.po_file = ''
        # Source text
        self.source = ''
        # Target text
        self.target = ''
        # Location in the source code
        self.location = ''
        # The glossary term that failed the check
        self.term = ''
        # The base form of the translated glossary term
        self.translation = ''


PREFERRED_HUNSPELL_LOCALES = {
    'ar': 'ar_EG',
    'ca': 'ca_ES',
    'de': 'de_DE',
    'en': 'en_US',
    'es': 'es_ES',
    'eu': 'eu_ES',
    'fr': 'fr_FR',
    'it': 'it_IT',
    'kk': 'kk_KZ',
    'ko': 'ko_KR',
    'nl': 'nl_NL',
    'pt': 'pt_PT',
    'sr': 'sr_Latn_RS',
    'sv': 'sv_SE',
    'sw': 'sw_KE',
}

# Hunspell needs country-specific locales. This is filled in by
# load_hunspell_locales() with the dictionaries found on the system.
HUNSPELL_LOCALES = {}


#############################################################################
# Glossary Loading                                                          #
#############################################################################

def get_hunspell_locale(locale):
    """Returns the corresponding Hunspell locale for this locale, or empty
    string if not available."""
    return HUNSPELL_LOCALES.get(locale, '')


def load_hunspell_locales():
    """Registers locales for Hunspell.

    Maps a list of generic locales to specific locales from available
    Hunspell dictionaries.

    """

    try:
        # Pass empty input so that hunspell exits after listing its
        # dictionaries instead of waiting for words on stdin. 'check=True'
        # already raises CalledProcessError on failure.
        hunspell_result = subprocess.run(
            ['hunspell', '-D'], input='', capture_output=True, text=True,
            check=True)

        # Track the locales added in this run so that we can warn when several
        # country variants are found for the same language
        new_hunspell_locales = {}

        # Dictionary information is printed to stderr
        for line in sorted(hunspell_result.stderr.split('\n')):
            # Discard text output that is not a file prefix
            if os.path.isfile(line + '.dic') or os.path.isfile(line + '.aff'):
                full_locale = os.path.basename(line)
                language = full_locale.split('_')[0]

                if language in PREFERRED_HUNSPELL_LOCALES.keys():
                    if PREFERRED_HUNSPELL_LOCALES[language] == full_locale:
                        print('Found Hunspell locale:',
                              language, ':', full_locale)
                        new_hunspell_locales[language] = full_locale
                        HUNSPELL_LOCALES[language] = full_locale
                else:
                    if language in new_hunspell_locales.keys():
                        print('WARNING: Multiple countries found for', language,
                              ':', full_locale, ' - ', new_hunspell_locales[language])
                    if language not in HUNSPELL_LOCALES:
                        print('Found Hunspell locale:',
                              language, ':', full_locale)
                        new_hunspell_locales[language] = full_locale
                        HUNSPELL_LOCALES[language] = full_locale
                        HUNSPELL_LOCALES[full_locale] = full_locale

    except FileNotFoundError:
        print('Hunspell is not installed, skipping stem matching.')
    except subprocess.CalledProcessError as err:
        print('Failed to run hunspell. Error code:',
              err.returncode, '\n', err.stderr)


def is_vowel(character):
    """Helper function for creating inflections of English words."""
    return character in ('a', 'e', 'i', 'o', 'u', 'y')


def make_english_plural(word):
    """Create plural forms for nouns.

    This will create a few nonsense entries for irregular plurals, but
    it's good enough for our purpose. The glossary already contains
    pluralized terms, so we don't add plural forms for strings ending in 's'.

    """
    result = ''
    if not word.endswith('s'):
        if word.endswith('y') and not is_vowel(word[-2:-1]):
            result = word[0:-1] + 'ies'
        elif word.endswith('z') or word.endswith('x') or word.endswith('ch') or word.endswith('sh') or word.endswith('o'):
            result = word + 'es'
        else:
            result = word + 's'
    return result
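
# A quick illustration of the plural rules above (no irregular forms):
#   make_english_plural('city') -> 'cities'
#   make_english_plural('ax')   -> 'axes'
#   make_english_plural('boy')  -> 'boys'
#   make_english_plural('bus')  -> ''  (already ends in 's')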


def make_english_verb_forms(word):
    """Create inflected forms of an English verb: -ed and -ing forms.

    Will create nonsense for irregular verbs.
    """
    result = []
    if word.endswith('e'):
        result.append(word[0:-1] + 'ing')
        result.append(word + 'd')
    elif is_vowel(word[-2:-1]) and not is_vowel(word[-1]):
        # The consonant is duplicated here if the last syllable is stressed.
        # We can't detect stress, so we add both variants.
        result.append(word + word[-1] + 'ing')
        result.append(word + 'ing')
        result.append(word + word[-1] + 'ed')
        result.append(word + 'ed')
    elif word.endswith('y') and not is_vowel(word[-2:-1]):
        result.append(word + 'ing')
        result.append(word[0:-1] + 'ied')
    else:
        result.append(word + 'ing')
        result.append(word + 'ed')
    # 3rd person 's' follows the same pattern as noun plurals.
    # We omitted words ending in 's' in the plural function, so we add them here.
    if word.endswith('s'):
        result.append(word + 'es')
    else:
        result.append(make_english_plural(word))
    return result
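
# A quick illustration of the rules above:
#   make_english_verb_forms('use')   -> ['using', 'used', 'uses']
#   make_english_verb_forms('stop')  -> ['stopping', 'stoping', 'stopped', 'stoped', 'stops']
#   make_english_verb_forms('carry') -> ['carrying', 'carried', 'carries']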


def load_glossary(glossary_file, locale):
    """Build a glossary from the given Transifex glossary csv file for the
    given locale."""
    result = []
    counter = 0
    term_index = 0
    term_comment_index = 0
    wordclass_index = 0
    translation_index = 0
    comment_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            column_counter = 0
            for header in row:
                if header == 'term':
                    term_index = column_counter
                elif header == 'comment' or header == 'notes':
                    term_comment_index = column_counter
                elif header == 'pos':
                    wordclass_index = column_counter
                elif header in ('translation_' + locale, locale):
                    translation_index = column_counter
                elif header == 'comment_' + locale or header == 'notes_' + locale:
                    comment_index = column_counter
                column_counter = column_counter + 1
        # If there is a translation, parse the entry
        # We also have some obsolete terms in the glossary that we want to
        # filter out.
        elif len(row[translation_index].strip()) > 0 and not row[term_comment_index].startswith('OBSOLETE'):
            if translation_index == 0:
                raise Exception(
                    'Locale %s is missing from glossary file.' % locale)
            if comment_index == 0:
                raise Exception(
                    'Comment field for locale %s is missing from glossary file.' % locale)
            entry = GlossaryEntry()
            entry.terms.append(row[term_index].strip())
            if row[wordclass_index] == 'Noun':
                plural = make_english_plural(entry.terms[0])
                if len(plural) > 0:
                    entry.terms.append(plural)
            elif row[wordclass_index] == 'Verb':
                verb_forms = make_english_verb_forms(entry.terms[0])
                for verb_form in verb_forms:
                    entry.terms.append(verb_form)

            entry.translations.append(row[translation_index].strip())

            # Misuse the comment field to provide a list of inflected forms.
            # Otherwise, we would get tons of false positive hits in the checks
            # later on and the translators would have our heads on a platter.
            delimiter = '|'
            if len(row[comment_index].strip()) > 1 and delimiter in row[comment_index]:
                inflections = row[comment_index].split(delimiter)
                for inflection in inflections:
                    # Skip empty entries, e.g. from a trailing delimiter,
                    # because an empty string would match any translation.
                    if inflection.strip():
                        entry.translations.append(inflection.strip())

            result.append(entry)
        counter = counter + 1
    return result
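
# Illustrative parse: a glossary row with term 'ax', pos 'Noun', translation
# 'Axt' and comment 'Äxte|' (hypothetical data) yields
#   entry.terms        == ['ax', 'axes']
#   entry.translations == ['Axt', 'Äxte']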


#############################################################################
# Term Checking                                                             #
#############################################################################


def contains_term(string, term):
    """Checks whether 'string' contains 'term' as a whole word.

    This check is case-insensitive.

    """
    result = False
    # Regex is slow, so we do this preliminary check
    if term.lower() in string.lower():
        # Now make sure that it's a whole word!
        # We don't want to match "AI" against "again" etc.
        regex = re.compile(
            r'(^|.*\W)' + re.escape(term) + r'(\W.*|$)', re.IGNORECASE)
        result = regex.match(string) is not None
    return result
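
# Illustrative behaviour:
#   contains_term('Attack the enemy soldier', 'soldier') -> True
#   contains_term('It happened again', 'AI') -> False  (substring, not a word)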


def source_contains_term(source_to_check, entry, glossary):
    """Checks if the source string contains the glossary entry while filtering
    out superstrings from the glossary, e.g. we don't want to check 'arena'
    against 'battle arena'."""
    source_to_check = source_to_check.lower()
    for term in entry.terms:
        term = term.lower()
        if term in source_to_check:
            source_regex = re.compile(
                r'(^|.*[\s,.])' + re.escape(term) + r'([\s,.].*|$)')
            if source_regex.match(source_to_check):
                for entry2 in glossary:
                    if entry.terms[0] != entry2.terms[0]:
                        for term2 in entry2.terms:
                            term2 = term2.lower()
                            if term2 != term and term in term2 and term2 in source_to_check:
                                source_to_check = source_to_check.replace(
                                    term2, '')
                # Check if the source still contains the term to check
                return contains_term(source_to_check, term)
    return False
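
# Example: for the glossary entry 'arena' and the source 'Enter the battle
# arena now', the superstring 'battle arena' (a separate glossary entry) is
# stripped from the source first, so 'arena' is not falsely required in the
# translation of this string.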


def append_hunspell_stems(hunspell_locale, translation):
    """ Use hunspell to append the stems for terms found = less work for glossary editors.
    The effectiveness of this check depends on how good the hunspell data is."""
    try:
        hunspell_result = subprocess.run(['hunspell', '-d', hunspell_locale,
                                          '-s'], input=translation, capture_output=True, text=True, check=True)
        hunspell_result.check_returncode()

        for line in hunspell_result.stdout.split('\n'):
            linedata = line.split(' ')
            if len(linedata) == 2:
                translation = ' '.join([translation, linedata[1]])

    except subprocess.CalledProcessError:
        print('Failed to run hunspell for locale: ' + hunspell_locale)
    return translation
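
# Example (assuming a German dictionary is installed): for the input
# 'Die Äxte liegen bereit', 'hunspell -s' may report the stem 'Axt' for
# 'Äxte', so the returned string also contains 'Axt' and the glossary
# translation can match against its base form.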


def translation_has_term(entry, target):
    """Verify the target translation against all translation variations from
    the glossary."""
    result = False
    for translation in entry.translations:
        if contains_term(target, translation):
            result = True
            break
    return result


def check_file(csv_file, glossaries, locale, po_file):
    """Run the actual check."""
    translations = read_csv_file(csv_file)
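    # po2csv writes one row per message with (at least) the columns
    # 'location', 'source' and 'target'; a data row might look like
    # ['…/militarysite.cc:123', 'Attack', 'Angreifen'] (illustrative values).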
    source_index = 0
    target_index = 0
    location_index = 0
    hits = []
    counter = 0
    hunspell_locale = get_hunspell_locale(locale)
    for row in translations:
        # Detect the column indices
        if counter == 0:
            column_counter = 0
            for header in row:
                if header == 'source':
                    source_index = column_counter
                elif header == 'target':
                    target_index = column_counter
                elif header == 'location':
                    location_index = column_counter
                column_counter = column_counter + 1
        else:
            for entry in glossaries[locale]:
                # Check if the source text contains the glossary term.
                # Filter out superstrings, e.g. we don't want to check
                # "arena" against "battle arena"
                if source_contains_term(row[source_index], entry, glossaries[locale]):
                    # Skip empty translations
                    if row[target_index] == '':
                        continue
                    # Now verify the translation against all translation
                    # variations from the glossary
                    term_found = translation_has_term(entry, row[target_index])
                    # Add Hunspell stems for better matches and try again
                    # We do it here because the Hunspell manipulation is slow.
                    if not term_found and hunspell_locale != '':
                        target_to_check = append_hunspell_stems(
                            hunspell_locale, row[target_index])
                        term_found = translation_has_term(
                            entry, target_to_check)
                    if not term_found:
                        hit = FailedTranslation()
                        hit.source = row[source_index]
                        hit.target = row[target_index]
                        hit.location = row[location_index]
                        hit.term = entry.terms[0]
                        hit.translation = entry.translations[0]
                        hit.locale = locale
                        hit.po_file = po_file
                        hits.append(hit)
        counter = counter + 1
    return hits


#############################################################################
# Main Loop                                                                 #
#############################################################################


def check_translations_with_glossary(input_path, output_path, glossary_file, only_locale):
    """Main loop.

    Loads the Transifex and Hunspell glossaries, converts all po files
    for languages that have glossary entries to temporary csv files,
    runs the check and then reports any hits to csv files.

    """
    print('Locale: ' + only_locale)
    temp_path = make_path(output_path, 'temp_glossary')
    hits = []
    locale_list = []

    glossaries = {}
    load_hunspell_locales()

    source_directories = sorted(os.listdir(input_path), key=str.lower)
    for dirname in source_directories:
        dirpath = os.path.join(input_path, dirname)
        if os.path.isdir(dirpath):
            source_files = sorted(os.listdir(dirpath), key=str.lower)
            sys.stdout.write("\nChecking text domain '" + dirname + "': ")
            sys.stdout.flush()
            for source_filename in source_files:
                po_file = dirpath + '/' + source_filename
                if source_filename.endswith('.po'):
                    locale = source_filename[0:-3]
                    if only_locale in ('all', locale):
                        # Load the glossary if we haven't seen this locale
                        # before
                        if locale not in glossaries:
                            sys.stdout.write(
                                '\nLoading glossary for ' + locale)
                            glossaries[locale] = load_glossary(
                                glossary_file, locale.lower())
                            sys.stdout.write(' - %d entries ' %
                                             len(glossaries[locale]))
                            sys.stdout.flush()
                        # Only bother with locales that have glossary entries
                        if len(glossaries[locale]) > 0:
                            sys.stdout.write(locale + ' ')
                            sys.stdout.flush()
                            if locale not in locale_list:
                                locale_list.append(locale)
                            csv_file = os.path.abspath(os.path.join(
                                temp_path, dirname + '_' + locale + '.csv'))
                            # Convert to csv for easy parsing
                            try:
                                subprocess.run(
                                    ['po2csv', '--progress=none', po_file, csv_file], check=True)

                                # Now run the actual check
                                current_hits = check_file(
                                    csv_file, glossaries, locale, dirname)
                                for hit in current_hits:
                                    hits.append(hit)
                            except subprocess.CalledProcessError:
                                print('Error with po2csv for:',
                                      po_file, csv_file)

                            # The csv file is no longer needed, delete it.
                            # po2csv may have failed before creating it.
                            if os.path.exists(csv_file):
                                os.remove(csv_file)

    hits = sorted(hits, key=lambda hit: [hit.locale, hit.translation])
    for locale in locale_list:
        locale_result = '"glossary_term","glossary_translation","source","target","file","location"\n'
        counter = 0
        for hit in hits:
            if hit.locale == locale:
                row = '"%s","%s","%s","%s","%s","%s"\n' % (
                    hit.term, hit.translation, hit.source, hit.target, hit.po_file, hit.location)
                locale_result = locale_result + row
                counter = counter + 1
        dest_filepath = make_path(output_path, locale)
        with open(dest_filepath + '/glossary_check.csv', 'wt') as dest_file:
            dest_file.write(locale_result)
        # Uncomment this line to print a statistic of the number of hits for each locale
        # print("%s\t%d"%(locale, counter))

    delete_path(temp_path)
    return 0
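
# Each checked locale ends up with a report like
#   po_validation/translators/de/glossary_check.csv  (here: locale 'de')
# whose columns follow the header row written in the loop above.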


def main():
    """Checks whether we are in the correct directory and everything's there,
    then runs a glossary check over all PO files."""
    if len(sys.argv) == 2 or len(sys.argv) == 3:
        print('Running glossary checks:')
    else:
        print(
            'Usage: glossary_checks.py <relative-path-to-glossary> [locale]')
        return 1

    try:
        print('Current time: %s' % time.ctime())
        # Prepare the paths
        glossary_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__), sys.argv[1]))
        locale = 'all'
        if len(sys.argv) == 3:
            locale = sys.argv[2]

        if not os.path.isfile(glossary_file):
            print('There is no glossary file at ' + glossary_file)
            return 1

        input_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '../data/i18n/translations'))
        output_path = make_path(os.path.dirname(
            __file__), '../po_validation/translators')
        result = check_translations_with_glossary(
            input_path, output_path, glossary_file, locale)
        print('Current time: %s' % time.ctime())
        return result

    except Exception:
        print('Something went wrong:')
        traceback.print_exc()
        # output_path is only defined if we got far enough to create it
        if 'output_path' in locals():
            delete_path(make_path(output_path, 'temp_glossary'))
        return 1


if __name__ == '__main__':
    sys.exit(main())