diff options
author | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2019-04-18 23:20:46 +0300 |
---|---|---|
committer | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2019-04-18 23:20:46 +0300 |
commit | 481c69be02c95fbf20567d8de617315484e2d692 (patch) | |
tree | 71bf406543586f278a5e779a0befa57f0fe529e9 | |
parent | 07d4fa1fe85eba554becb20bb4a872acb320a423 (diff) | |
download | openrussian-cli-481c69be02c95fbf20567d8de617315484e2d692.tar.gz |
major changes (unfortunately not done systematically)
* keep the openrussian-sql.zip to avoid bitrot (sudden breakage when building the
project against a recent DB)
* allow lookups against inflections.
A special table `bare_inflections` was added to the DB in order to speed this up.
* authentic autocompletions taking every parameter into account, using a magic `-C` parameter
* language selection via locale or -L<lang>
* translation lookup
* allow multiple search terms on the command line without escaping
* all SQL strings are properly escaped now to avoid (accidental) code insertion
* luautf8 is mandatory now and used much more, which is safer
than handling Unicode strings with the builtin functions
* ignore "disabled" database entries
* added option -V (verbatim)
* more protections against empty fields
* print "DERIVED FROM", "AUDIO", "USAGE" and various word relation sections
* print word ranking and level (A1-C2...)
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | Makefile | 16 | ||||
-rw-r--r-- | openrussian-completion.bash | 37 | ||||
-rw-r--r-- | openrussian-sql.zip | bin | 0 -> 35607877 bytes | |||
-rwxr-xr-x | openrussian.lua | 505 | ||||
-rw-r--r-- | postprocess.sql | 78 |
6 files changed, 542 insertions, 97 deletions
@@ -1,3 +1,4 @@ openrussian -openrussian-sql.zip openrussian-sqlite3.db +# This file is currently versioned, so it should not be ignored. +#openrussian-sql.zip @@ -13,16 +13,20 @@ openrussian : openrussian.lua chmod a+x $@ # Marking the ZIP intermediate makes sure it is automatically deleted after -# generating the SQLITE database while we don't always re-download it. +# generating the SQLite database while we don't always re-download it. # Effectively, it will behave like a temporary file. -.INTERMEDIATE: openrussian-sql.zip +# NOTE: This is disabled for the time being since the database schema changes +# from time to time, so a rebuild could easily break the script. +# Instead, we add the file to Git, so every clone is guaranteed to contain +# a database matching the openrussian.lua script. +#.INTERMEDIATE: openrussian-sql.zip openrussian-sql.zip: wget -O $@ 'https://en.openrussian.org/downloads/openrussian-sql.zip' -# NOTE: VACUUMing the database saves a few megabytes -openrussian-sqlite3.db : openrussian-sql.zip mysql2sqlite +openrussian-sqlite3.db : openrussian-sql.zip mysql2sqlite postprocess.sql + $(RM) $@ unzip -p $< openrussian.sql | ./mysql2sqlite - | sqlite3 $@ - sqlite3 $@ VACUUM + sqlite3 $@ -batch <postprocess.sql # NOTE: Installation of the Bash completions depends on the Debain bash-completion # package being installed or something similar @@ -33,4 +37,4 @@ install : openrussian openrussian-sqlite3.db openrussian-completion.bash cp openrussian-completion.bash $(DESTDIR)$(COMPLETIONSDIR)/openrussian clean: - $(RM) -f openrussian openrussian-sql.zip openrussian-sqlite3.db + $(RM) openrussian openrussian-sql.zip openrussian-sqlite3.db diff --git a/openrussian-completion.bash b/openrussian-completion.bash index fd27c06..9892d65 100644 --- a/openrussian-completion.bash +++ b/openrussian-completion.bash @@ -1,33 +1,20 @@ #/bin/bash -# NOTE: Uses SQL for matching instead of compgen, since it's probably not -# a good idea to always retrieve all the words, 
although this happens anyway -# when completing an empty word. - _openrussian_completions() { - # NOTE: sqlite3's command-line tool does not provide a way to properly - # embed strings, so we try to escape it here. - WORD=$(echo -n "$2" | sed 's/"/""/g') - - # This is a rather convoluted way of writing - # SELECT bare FROM words WHERE bare LIKE "${WORD}%"; - # but allowing $WORD to contain accentuation characters, which can easily - # happen when cutting/pasting words from the openrussian.lua manpages or the - # internet. - # NOTE: This is merely a workaround since all completions will begin with - # $WORD including accents and end without accents, so the suggested completions - # will likely be with wrong accents. - # It seems to be impossible, at least in Bash, to rubout $WORD first. - SQL=$(printf 'SELECT "%s" || SUBSTR(bare, LENGTH(REPLACE("%s", "\u0301", ""))+1) - FROM words WHERE bare LIKE REPLACE("%s%%", "\u0301", "")' \ - "$WORD" "$WORD" "$WORD") - - # Calculate database path based on the installation path of the `openrussian` - # CLI tool. This way, we can avoid preprocessing the script during installation. - PREFIX=$(dirname $(which openrussian))/.. + # Autocompletion of all search terms can be outsourced + # to the openrussian.lua script, which has the advantage + # of taking all command line flags into account. + # + # NOTE: mapfile is used to properly capture lines containing + # whitespace + # FIXME: Does not work if the last token contains spaces + # FIXME: Also fails when trying to complete multiple tokens + # (in order to avoid white-space quoting). + mapfile -t COMPREPLY < <(openrussian -C "${COMP_WORDS[@]:1}" 2>/dev/null) - COMPREPLY=($(sqlite3 "$PREFIX/share/openrussian/openrussian-sqlite3.db" "$SQL")) + # NOTE: openrussian.lua currently does not complete switch-names. 
+ COMPREPLY+=($(compgen -W "-Len -Lde -V -p" -- "$2")) } complete -F _openrussian_completions openrussian diff --git a/openrussian-sql.zip b/openrussian-sql.zip Binary files differnew file mode 100644 index 0000000..585e773 --- /dev/null +++ b/openrussian-sql.zip diff --git a/openrussian.lua b/openrussian.lua index b75de7f..bd25930 100755 --- a/openrussian.lua +++ b/openrussian.lua @@ -1,29 +1,69 @@ #!/usr/bin/lua5.2 local driver = require "luasql.sqlite3" +local lutf8 = require "lua-utf8" + +local ACCENT = lutf8.char(0x0301) -- Accent combining character + +local lang = os.setlocale(nil, "ctype"):match("^([^_]+)") + +local search_words = {} local function usage(stream) - stream:write("Usage: ", arg[0], " [-p] <word>\n") + stream:write("Usage: ", arg[0], " [-L<lang>] [-V] [-p] <pattern...>\n", + "\t-L<lang> Set language to <lang> (currently en or de, guessed from locale)\n", + "\t-V Verbatim matching (no case folding and inflections)\n", + "\t-p Print Troff code to stdout\n") end for i = 1, #arg do if arg[i]:sub(1, 1) == "-" then - if arg[i]:sub(2) == "p" then + local opt = arg[i]:sub(2) + + if opt:sub(1, 1) == "L" then + if #opt > 1 then + lang = opt:sub(2) + elseif i == #arg then + usage(io.stderr) + os.exit(false) + else + lang = arg[i+1] + i = i + 1 + end + elseif opt == "V" then + verbatim = true + elseif opt == "p" then use_stdout = true + elseif opt == "C" then + -- This is a "secret" command used for implementing + -- auto-completions. + -- It will usually be the first argument. + auto_complete = true else usage(io.stderr) os.exit(false) end else - search_word = arg[i] + table.insert(search_words, arg[i]) end end -if not search_word then +if #search_words == 0 then usage(io.stderr); os.exit(false) end +-- Allowing multiple arguments to be concat into the search words +-- is useful when searching for a translation which may contain +-- spaces without quoting the entire search term. +local search_word = table.concat(search_words, " ").. 
+ (auto_complete and "*" or "") + +-- FIXME: Currently only English and German are actually +-- contained in the database, but this might change. +-- Perhaps query the availability dynamically. +if lang ~= "en" and lang ~= "de" then lang = "en" end + local function dirname(path) return path:match("^(.*)/.+$") or "." end @@ -37,35 +77,71 @@ local database = PREFIX.."/share/openrussian/openrussian-sqlite3.db" if not io.open(database) then database = "openrussian-sqlite3.db" end local out_stream -local lang = "en" local env = assert(driver.sqlite3()) local con = assert(env:connect(database)) +-- A SQL-compatible globber. +-- Necessary since globbing is usually done as part of the +-- SQL query. +-- +-- NOTE: This may be reimplemented more efficiently by translating +-- the glob pattern to a Lua pattern. +-- Unfortunately, the Glob pattern syntax appears to be undefined, +-- probably because it defaults to the system glob. +-- +-- Alternatively, we might override the MATCH function with Lua patterns +-- and use MATCH instead of GLOB, but this might be inefficient. +-- In order to make use of the query optimizer, we must either use +-- LIKE or GLOB. +-- +-- Yet another alternative might be to parse all translations into +-- a separate index, speeding up translation lookups and avoiding +-- the need for globbing in Lua here. +function glob(pattern, str) + local cur = assert(con:execute(string.format([[ + SELECT '%s' GLOB '%s' + ]], con:escape(str), con:escape(pattern)))) + local row = assert(cur:fetch()) + cur:close() + + return row ~= 0 +end + -- Turns a character followed by apostroph into a combined -- accented character. -- NOTE: This encodes the accent (u0301) in bytes, so it can be -- used for printing to stdout or into Troff code. 
local function map_accented(str) - return (str:gsub("'", "\xCC\x81")) + return (lutf8.gsub(str, "'", ACCENT)) end --- FIXME: This does not work for tables since tbl will count the +-- FIXME: map_accented() does not work for tables since tbl will count the -- combined character as two. Theoretically, Groff has composite characters -- like \u[u043E_0301] but they don't work for all the cyrillic -- vocals. +-- If we really wanted to, we could replace every accented character +-- with an inline macro that is defined at Troff runtime depending on the +-- output device, so we could get accented characters in PDF tables at least. local function map_tbl(str) - return (str:gsub("(..)'", "\\fI%1\\fP")) + return (lutf8.gsub(str, "(.)'", "\\fI%1\\fP")) end +-- FIXME: Apparently, there are entries without declension or empty declension +-- entries, e.g. kosha4ij. +-- These should be detected and the entire section should be omitted. local function format_declension(tag, decl_id, short_form) local cur = assert(con:execute(string.format([[ SELECT * FROM declensions WHERE id = %d ]], decl_id))) local row = assert(cur:fetch({}, "a")) cur:close() - out_stream:write(tag, ';', map_tbl(row.nom), ';', map_tbl(row.gen), ';', - map_tbl(row.dat), ';', map_tbl(row.acc), ';', - map_tbl(row.inst), ';', map_tbl(row.prep)) + + out_stream:write(tag, ';', map_tbl(row.nom or "-"), ';', + map_tbl(row.gen or "-"), ';', + map_tbl(row.dat or "-"), ';', + map_tbl(row.acc or "-"), ';', + map_tbl(row.inst or "-"), ';', + map_tbl(row.prep or "-")) if short_form then out_stream:write(';', map_tbl(short_form)) end out_stream:write('\n') end @@ -83,11 +159,14 @@ function format.noun(word_id, accented) local cur = assert(con:execute(string.format([[ SELECT * FROM nouns WHERE word_id = %d ]], word_id))) - local row = assert(cur:fetch({}, "a")) + local row = cur:fetch({}, "a") cur:close() - out_stream:write('.SH WORD\n', - map_accented(accented), ' \\-\\- noun, ') + -- NOTE: This can probably happen as with any 
other word category + -- (example?) + if not row then return end + + out_stream:write('.SH GENDER\n') if row.gender and row.gender ~= "" then local genders = {m = "male", f = "female", n = "neuter"} out_stream:write(genders[row.gender], ', ') @@ -95,12 +174,16 @@ function format.noun(word_id, accented) out_stream:write(row.animate == 1 and 'animate' or 'inanimate', '\n') if row.partner and row.partner ~= "" then - -- FIXME: What exactly is a noun "partner"? - -- Seems to be used mostly for male/female pairs etc. + -- NOTE: Noun "partners" seem to be male/female counterparts. + -- FIXME: It would also be nice to include an accented version, + -- but since the DB lists the partner as a string instead of + -- word_id, finding the right entry could be unreliable out_stream:write('.SH PARTNER\n', row.partner, '\n') end + -- FIXME: Rotate this table (only two columns: singular and plural). + -- Lines should be short. out_stream:write('.SH DECLENSION\n', '.TS\n', 'allbox,tab(;);\n', @@ -128,12 +211,18 @@ function format.adjective(word_id, accented) local cur = assert(con:execute(string.format([[ SELECT * FROM adjectives WHERE word_id = %d ]], word_id))) - local row = assert(cur:fetch({}, "a")) + local row = cur:fetch({}, "a") cur:close() - out_stream:write('.SH WORD\n', - map_accented(accented), ' \\-\\- adjective\n') + -- NOTE: Seldomly (e.g. nesomnenno), there is no entry in adjectives + if not row then return end + + --out_stream:write('.SH CATEGORY\n', + -- 'adjective\n') + -- FIXME: Rotate this table (columns will be gender+plural). + -- Lines should be short. 
+ -- FIXME: Short form not always present out_stream:write('.SH DECLENSION\n', '.TS\n', 'allbox,tab(;);\n', @@ -146,36 +235,52 @@ function format.adjective(word_id, accented) format_declension('Plural', row.decl_pl_id, row.short_pl) out_stream:write('.TE\n') - if row.comparative then + if row.comparative and row.comparative ~= "" then out_stream:write('.SH COMPARATIVE\n', map_accented(row.comparative), '\n') end - if row.superlative then + if row.superlative and row.superlative ~= "" then out_stream:write('.SH SUPERLATIVE\n', map_accented(row.superlative), '\n') end end +-- NOTE: There is no separate table for adverbs +-- Currently, we wouldn't print more than the category, which is also in the +-- header, so it is omitted. +function format.adverb(word_id, accented) + --out_stream:write('.SH CATEGORY\n', + -- 'adverb\n') +end + function format.verb(word_id, accented) local cur = assert(con:execute(string.format([[ SELECT * FROM verbs JOIN conjugations ON verbs.presfut_conj_id = conjugations.id WHERE verbs.word_id = %d ]], word_id))) - local row = assert(cur:fetch({}, "a")) + local row = cur:fetch({}, "a") cur:close() - out_stream:write('.SH WORD\n', - map_accented(accented), ' \\-\\- verb') - if row.aspect then out_stream:write(', ', row.aspect) end - out_stream:write('\n') + -- NOTE: Seldomly (e.g. est' -- to be), there is no entry in verbs + if not row then return end + + if row.aspect then + out_stream:write('.SH ASPECT\n', + row.aspect, '\n') + end if row.partner and row.partner ~= "" then - -- NOTE: Verb partners seem to be the aspect partners + -- NOTE: Verb partners seem to be the aspect partners. + -- They are either comma or semicolon separated. 
+ -- FIXME: It would also be nice to include an accented version, + -- but since the DB lists the partner as a string instead of + -- word_id, finding the right entry could be unreliable out_stream:write('.SH PARTNER\n', - row.partner, '\n') + lutf8.gsub(row.partner, "[;,]", ", "), '\n') end + -- FIXME: Conjugation sometimes empty (e.g. widat') -- FIXME: Can we assume that verbs without specified aspect are always -- perfective? out_stream:write('.SH ', row.aspect == "imperfective" and 'PRESENT\n' or 'FUTURE\n', @@ -200,22 +305,26 @@ function format.verb(word_id, accented) end function format.other(word_id, accented) - out_stream:write('.SH WORD\n', - map_accented(accented), '\n') + --out_stream:write('.SH CATEGORY\n', + -- 'other\n') end -function get_translations(word_id) +local function get_translations(word_id) local ret = {} -- FIXME: Fetch other translations if primary -- language is not available local cur = assert(con:execute(string.format([[ SELECT tl FROM translations - WHERE word_id = %d AND lang = "%s" - ]], word_id, lang))) + WHERE word_id = %d AND lang = '%s' + ]], word_id, con:escape(lang)))) local row = cur:fetch({}, "a") while row do - table.insert(ret, row.tl) + -- NOTE: One entry might contain many comma-separated + -- translations + for word in lutf8.gmatch(row.tl..", ", "(.-), ") do + table.insert(ret, word) + end row = cur:fetch({}, "a") end cur:close() @@ -223,13 +332,32 @@ function get_translations(word_id) return ret end --- NOTE: This lets SQL strip the accent char from the input, which --- allows users to cut and paste from generated output while we don't --- have to deal with Unicode in Lua. +-- Format reference to row from the words-table. +-- FIXME: Not printed bold since bold text and accents +-- don't work together (URxvt). +local function get_reference(word_row) + return map_accented(word_row.accented or word_row.bare).. 
+ '('..(word_row.type or "other")..')' +end + +-- NOTE: This strips the accent char, so users can cut and paste from +-- generated output. +-- This is done from Lua, since the right-hand side of GLOB should be a constant +-- to allow optimizations: +-- https://www.sqlite.org/optoverview.html#the_like_optimization +-- +-- TODO: Double-check whether the GLOB is actually optimized. +-- Theoretically, we need COLLATE BINARY for that. +-- +-- FIXME: Case-folding UTF8 / Collating is not supported by SQLite3. +-- If we want to support case-insensitive matching, it is mandatory, though. +-- Could be done using the ICU extension: +-- https://www.sqlite.org/src/artifact?ci=trunk&filename=ext/icu/README.txt local cur = assert(con:execute(string.format([[ - SELECT bare, accented, type, words.id AS word_id - FROM words WHERE bare = REPLACE("%s", CHAR(0x0301), "") -]], search_word))) + SELECT bare AS completions, * FROM words + WHERE LIKELY(disabled = 0) AND bare GLOB '%s' + ORDER BY rank +]], con:escape(lutf8.gsub(search_word, ACCENT, ""))))) local rows = {} local row @@ -240,17 +368,189 @@ until not row cur:close() +if not verbatim then + --[==[ + -- FIXME: These queries are tooo sloooow! Perhaps that's why the openrussion.org + -- website does not allow searching by declension prefixes. + -- This is because of the need for string-concatenations for every possible word + -- and because the GLOBbing cannot be optimized, even in the most common cases. + -- FIXME: This does not find braced-terms. Glob patterns are simply not powerful + -- enough to express "optional brace". + -- We'd probably need regexp for that. 
+ cur = assert(con:execute(string.format([[ + SELECT REPLACE(temp, "'", "") AS completions, words.* FROM words JOIN ( + -- Search word might be a noun or adjective declension + SELECT nom||","||gen||","||dat||","||acc||","||inst||","||prep AS temp, word_id + FROM declensions + UNION + -- Search word might be a special adjective inflection + SELECT comparative||","||superlative||","|| + short_m||","||short_f||","||short_n||","||short_pl AS temp, word_id + FROM adjectives + UNION + -- Search word might be a verb imperative, past form or conjugation + SELECT imperative_sg||","||imperative_pl||","||past_m||","||past_f||","||past_n||","||past_pl|| + sg1||","||sg2||","||sg3||","||pl1||","||pl2||","||pl3 AS temp, verbs.word_id + FROM verbs LEFT JOIN conjugations ON presfut_conj_id = conjugations.id + ) ON words.id = word_id + WHERE LIKELY(disabled = 0) AND ","||completions||"," GLOB '*,%s,*' + ORDER BY rank + ]], con:escape(lutf8.gsub(search_word, ACCENT, ""))))) + + -- This is an alternative to the above query. + -- It eliminates the concatenations, but has to iterate many tables redundantly. + -- Effectively it is twice as slow as the above query... 
+ cur = assert(con:execute(string.format([[ + SELECT REPLACE(temp, "'", "") AS completions, words.* FROM words JOIN ( + -- Search word might be a noun or adjective declension + SELECT nom AS temp, word_id FROM declensions + UNION ALL + SELECT gen AS temp, word_id FROM declensions + UNION ALL + SELECT dat AS temp, word_id FROM declensions + UNION ALL + SELECT acc AS temp, word_id FROM declensions + UNION ALL + SELECT inst AS temp, word_id FROM declensions + UNION ALL + SELECT prep AS temp, word_id FROM declensions + UNION ALL + -- Search word might be a special adjective inflection + SELECT comparative AS temp, word_id FROM adjectives + UNION ALL + SELECT superlative AS temp, word_id FROM adjectives + UNION ALL + SELECT short_m AS temp, word_id FROM adjectives + UNION ALL + SELECT short_f AS temp, word_id FROM adjectives + UNION ALL + SELECT short_n AS temp, word_id FROM adjectives + UNION ALL + SELECT short_pl AS temp, word_id FROM adjectives + UNION ALL + -- Search word might be a verb imperative or past form + SELECT imperative_sg AS temp, word_id FROM verbs + UNION ALL + SELECT imperative_pl AS temp, word_id FROM verbs + UNION ALL + SELECT past_m AS temp, word_id FROM verbs + UNION ALL + SELECT past_f AS temp, word_id FROM verbs + UNION ALL + SELECT past_n AS temp, word_id FROM verbs + UNION ALL + SELECT past_pl AS temp, word_id FROM verbs + UNION ALL + -- Search word might be a verb conjugation + SELECT sg1 AS temp, word_id FROM conjugations + UNION ALL + SELECT sg2 AS temp, word_id FROM conjugations + UNION ALL + SELECT sg3 AS temp, word_id FROM conjugations + UNION ALL + SELECT pl1 AS temp, word_id FROM conjugations + UNION ALL + SELECT pl2 AS temp, word_id FROM conjugations + UNION ALL + SELECT pl3 AS temp, word_id FROM conjugations + ) ON words.id = word_id + WHERE LIKELY(disabled = 0) AND completions GLOB '%s' + ORDER BY rank + ]], con:escape(lutf8.gsub(search_word, ACCENT, ""))))) + ]==] + + -- This query uses a new `bare_inflections` table, since all 
queries + -- using existing tables (see above) are way too slow, especially for + -- autocompletions. + -- NOTE: The right-hand side of GLOB must be a constant, so that it can be + -- optimized using the index. + cur = assert(con:execute(string.format([[ + SELECT bare_inflections.bare AS completions, words.* + FROM words JOIN bare_inflections ON words.id = word_id + WHERE LIKELY(disabled = 0) AND completions GLOB '%s' + ORDER BY rank + ]], con:escape(lutf8.gsub(search_word, ACCENT, ""))))) + + repeat + row = cur:fetch({}, "a") + table.insert(rows, row) + until not row + + cur:close() +end + +-- Only if we do not find a Russian word, we try to find a translation. +-- This is not wrapped with the above query into one using a LEFT JOIN since +-- two queries are significantly faster - probably because of having to perform less +-- string concatenations. +if #rows == 0 then + -- NOTE: The translation entry frequently contains a comma-separated + -- list of translations + -- + -- FIXME: Case folding only works for ASCII, which should be sufficient for + -- German/English text (almost)... + -- FIXME: The string concatenation is a real slow-down and the GLOB cannot + -- be optimized. + -- Perhaps the translations should be in their own (new) indexed table. 
+ cur = assert(con:execute(string.format([[ + SELECT %s(", "||tl||", ") AS completions, words.* + FROM words JOIN translations ON words.id = word_id + WHERE LIKELY(disabled = 0) AND lang = '%s' AND completions GLOB %s('*, %s, *') + ORDER BY rank + ]], verbatim and "" or "LOWER", con:escape(lang), verbatim and "" or "LOWER", con:escape(search_word)))) + + repeat + row = cur:fetch({}, "a") + table.insert(rows, row) + until not row + + cur:close() +end + +if auto_complete then + -- FIXME: See above for notes on case-folding + local search_word_bare = lutf8.gsub(search_word, ACCENT, "") + search_word_bare = verbatim and search_word_bare or search_word_bare:lower() + + for _, row in ipairs(rows) do + -- NOTE: This code is reused for Russian base words, inflections and translated lookups, + -- so there is a common `completions` column. + -- Russian words can be treated like single-word translations. + -- Terms in this column can be comma-separated with and without spaces and + -- there may be braces. 
+ for word in lutf8.gmatch(row.completions..",", " *%(?(.-)%)?,") do + if glob(search_word, word) then + io.stdout:write(search_words[#search_words], + lutf8.sub(word, lutf8.len(search_word_bare)), "\n") + end + end + end + + os.exit(true) +end + if #rows == 0 then io.stderr:write('Word "', search_word, '" not found!\n') os.exit(false) end -if #rows == 1 then - row = rows[1] +-- Filter out duplicates +local word_ids = {} +local unique_rows = {} + +for _, row in ipairs(rows) do + if not word_ids[row.id] then + table.insert(unique_rows, row) + word_ids[row.id] = true + end +end + +if #unique_rows == 1 then + row = unique_rows[1] else - for i, row in ipairs(rows) do + for i, row in ipairs(unique_rows) do local word_accented = row.accented or row.bare - local tl = get_translations(row.word_id) + local tl = get_translations(row.id) io.stdout:write(i, ") ", map_accented(word_accented)) if #tl > 0 then io.stdout:write(" (", table.concat(tl, ", "), ")") end @@ -258,68 +558,143 @@ else end repeat - io.stdout:write("Show [1..", #rows, ", press enter to cancel]? "):flush() + io.stdout:write("Show [1..", #unique_rows, ", press enter to cancel]? "):flush() local choice = io.stdin:read():lower() if choice == "" or choice == "q" then os.exit() end - row = rows[tonumber(choice)] + row = unique_rows[tonumber(choice)] until row end -local word_id = row.word_id -local word_type = row.type or "other" +local word_id = row.id -- NOTE: Some words (e.g. personal pronouns) apparently do not -- come with accents!? local word_accented = row.accented or row.bare +local word_derived_from = row.derived_from_word_id +local word_audio = row.audio +local word_usage = row["usage_"..lang] +local word_type = row.type or "other" -- Open stream only now, after no more messages have to be written to -- stdout/stderr. 
out_stream = assert(use_stdout and io.stdout or io.popen("man /dev/stdin", "w")) +-- NOTE: The headers and footers shouldn't contain critical information +-- since they might not be printed at all. out_stream:write('.\\" t\n', - '.TH "', search_word, '" "', word_type, '"\n') + '.TH "', row.bare, '" "', word_type, '" "') +if row.rank then + out_stream:write('#', row.rank, row.level and ' ('..row.level..')' or '') +else + out_stream:write(row.level) +end +out_stream:write('" "openrussian.lua" "openrussian.org"\n') + +-- +-- Generic WORD section with translation. +-- +out_stream:write('.SH WORD\n', + map_accented(word_accented)) +local tl = get_translations(word_id) +if #tl > 0 then + out_stream:write(' \\-\\- ', table.concat(tl, ', ')) +end +out_stream:write('\n') -- -- Word-specific sections +-- NOTE: word_accented is required only for format.noun() and could be +-- avoided altogether. -- format[word_type](word_id, word_accented) -- -- Generic sections -- -local tl = get_translations(word_id) -if #tl > 0 then - out_stream:write('.SH TRANSLATION\n', - table.concat(tl, ', '), '\n') +if word_usage then + out_stream:write('.SH USAGE\n', + word_usage, '\n') +end + +-- FIXME: Perhaps this should rather be part of the SEE ALSO section +if word_derived_from then + cur = assert(con:execute(string.format([[ + SELECT bare, accented, type FROM words + WHERE LIKELY(disabled = 0) AND id = %d + ]], word_derived_from))) + row = assert(cur:fetch({}, "a")) + cur:close() + + out_stream:write('.SH DERIVED FROM\n', + get_reference(row), '\n') end -- --- NOTE: There can be many examples, so print them last. +-- NOTE: There can be many examples, so print them late. 
-- cur = assert(con:execute(string.format([[ SELECT ru, start, length, tl FROM sentences_words JOIN sentences ON sentence_id = sentences.id - WHERE word_id = %d AND lang = "%s" -]], word_id, lang))) + WHERE word_id = %d AND lang = '%s' +]], word_id, con:escape(lang)))) row = cur:fetch({}, "a") if row then out_stream:write('.SH EXAMPLES\n') repeat - -- FIXME: Highlight search word in sentences. - -- start/length are apparently in characters - -- instead of bytes. - --[[ - local ru_hl = row.ru:sub(1, row.start)..'\\fI'.. - row.ru:sub(row.start+1, row.start+1+row.length)..'\\fP'.. - row.ru:sub(row.start+1+row.length+1) - ]] + -- FIXME: The accent is not always available in the default + -- italic font when formatting for PDF. + local ru_hl = lutf8.sub(row.ru, 1, row.start)..'\\fI'.. + lutf8.sub(row.ru, row.start+1, row.start+1+row.length-1)..'\\fP'.. + lutf8.sub(row.ru, row.start+1+row.length) + out_stream:write('.TP\n', - map_accented(row.ru), '\n', + map_accented(ru_hl), '\n', row.tl, '\n') row = cur:fetch({}, "a") until not row +end +cur:close() +-- Audio recordings might be useful occasionally, but this is an offline/terminal +-- application, so it makes sense to print them last (like URLs in manpages). +-- +-- NOTE: There is an UE man-macro, but it doesn't seem to be very helpful here and +-- seems to bring no advantages when formatting as a PDF. +-- It could be typset in the default fixed-width font (\fC), but it does not contain +-- cyrillic characters, so we don't do that either. +if word_audio then + out_stream:write('.SH AUDIO\n', + word_audio, '\n') end + +-- Disable adjusting (space-stretching) for the related-word lists. +-- Don't forget to enable this again if something follows these sections. +out_stream:write('.na\n') + +-- NOTE: The results are grouped by relation, so that they can be +-- easily printed in one section per relation. +-- Unfortunately, we cannot define custom collating sequences with LuaSQL. 
+-- FIXME: Print this under a single SEE ALSO master section? +-- FIXME: Results should perhaps be ordered by `type`? +cur = assert(con:execute(string.format([[ + SELECT bare, accented, type, relation + FROM words_rels JOIN words ON rel_word_id = words.id + WHERE LIKELY(disabled = 0) AND words_rels.word_id = %d + ORDER BY relation, rank +]], word_id))) + +local cur_relation +row = cur:fetch({}, "a") +while row do + if cur_relation ~= row.relation then + cur_relation = row.relation + out_stream:write('.SH ', cur_relation:upper(), '\n') + end + out_stream:write(get_reference(row)) + row = cur:fetch({}, "a") + out_stream:write(row and row.relation == cur_relation and ', ' or '\n') +end + cur:close() -- diff --git a/postprocess.sql b/postprocess.sql new file mode 100644 index 0000000..5976cb1 --- /dev/null +++ b/postprocess.sql @@ -0,0 +1,78 @@ +-- Drop unused tables, hopefully saving a few megabytes. +-- `expressions_words` is for excercises??? +DROP TABLE expressions_words; +-- `categories_words2` references `categories` but is not in the database!? +DROP TABLE categories_words2; + +-- This table is especially added to aid lookups and autocompletions on all +-- possible inflections. +-- This is significantly faster than using the existing tables. +-- +-- FIXME: Some of the columns contain multiple comma-separated values +-- (sometimes in braces), so we probably need a Lua script for initializion. +-- TODO: Translations should probably be in this table, or a separate one as well. +-- But see above. +CREATE TABLE bare_inflections (`word_id` INTEGER NOT NULL, `bare` VARCHAR(100) NOT NULL COLLATE BINARY); +-- Indexing the `bare` column allows ultra-fast lookups even via GLOB +-- in most common cases. For this optimization to work, the column must +-- also be BINARY collated. 
See also: +-- https://www.sqlite.org/optoverview.html#the_like_optimization +CREATE INDEX idx_bare_inflections_bare ON bare_inflections (`bare`); +INSERT INTO bare_inflections SELECT word_id, REPLACE(temp, "'", "") AS bare +FROM ( + -- Search word might be a noun or adjective declension + SELECT word_id, nom AS temp FROM declensions + UNION ALL + SELECT word_id, gen AS temp FROM declensions + UNION ALL + SELECT word_id, dat AS temp FROM declensions + UNION ALL + SELECT word_id, acc AS temp FROM declensions + UNION ALL + SELECT word_id, inst AS temp FROM declensions + UNION ALL + SELECT word_id, prep AS temp FROM declensions + UNION ALL + -- Search word might be a special adjective inflection + SELECT word_id, comparative AS temp FROM adjectives + UNION ALL + SELECT word_id, superlative AS temp FROM adjectives + UNION ALL + SELECT word_id, short_m AS temp FROM adjectives + UNION ALL + SELECT word_id, short_f AS temp FROM adjectives + UNION ALL + SELECT word_id, short_n AS temp FROM adjectives + UNION ALL + SELECT word_id, short_pl AS temp FROM adjectives + UNION ALL + -- Search word might be a verb imperative or past form + SELECT word_id, imperative_sg AS temp FROM verbs + UNION ALL + SELECT word_id, imperative_pl AS temp FROM verbs + UNION ALL + SELECT word_id, past_m AS temp FROM verbs + UNION ALL + SELECT word_id, past_f AS temp FROM verbs + UNION ALL + SELECT word_id, past_n AS temp FROM verbs + UNION ALL + SELECT word_id, past_pl AS temp FROM verbs + UNION ALL + -- Search word might be a verb conjugation + SELECT word_id, sg1 AS temp FROM conjugations + UNION ALL + SELECT word_id, sg2 AS temp FROM conjugations + UNION ALL + SELECT word_id, sg3 AS temp FROM conjugations + UNION ALL + SELECT word_id, pl1 AS temp FROM conjugations + UNION ALL + SELECT word_id, pl2 AS temp FROM conjugations + UNION ALL + SELECT word_id, pl3 AS temp FROM conjugations +) +WHERE bare <> ""; + +-- Saves a few megabytes +VACUUM; |