aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobin Haberkorn <robin.haberkorn@googlemail.com>2019-04-18 23:20:46 +0300
committerRobin Haberkorn <robin.haberkorn@googlemail.com>2019-04-18 23:20:46 +0300
commit481c69be02c95fbf20567d8de617315484e2d692 (patch)
tree71bf406543586f278a5e779a0befa57f0fe529e9
parent07d4fa1fe85eba554becb20bb4a872acb320a423 (diff)
downloadopenrussian-cli-481c69be02c95fbf20567d8de617315484e2d692.tar.gz
major changes (unfortunately not done systematically)
* keep the openrussian-sql.zip to avoid bitrot (sudden breakage when building the project against a recent DB) * allow lookups against inflections. A special table `bare_inflections` was added to the DB in order to speed this up. * authentic autocompletions taking every parameter into account, using a magic `-C` parameter * language selection via locale or -L<lang> * translation lookup * allow multiple search terms on the command line without escaping * all SQL strings are properly escaped now to avoid (accidental) code insertion * luautf8 is mandatory now and used much more, which is safer than handling Unicode strings with the builtin functions * ignore "disable" database entries * added option -V (verbatim) * more protections against empty fields * print "DERIVED FROM", "AUDIO", "USAGE" and various word relation sections * print word ranking and level (A1-C2...)
-rw-r--r--.gitignore3
-rw-r--r--Makefile16
-rw-r--r--openrussian-completion.bash37
-rw-r--r--openrussian-sql.zipbin0 -> 35607877 bytes
-rwxr-xr-xopenrussian.lua505
-rw-r--r--postprocess.sql78
6 files changed, 542 insertions, 97 deletions
diff --git a/.gitignore b/.gitignore
index 8308aa1..ff004d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
openrussian
-openrussian-sql.zip
openrussian-sqlite3.db
+# This file is currently versioned, so it should not be ignored.
+#openrussian-sql.zip
diff --git a/Makefile b/Makefile
index 8743043..a6b777f 100644
--- a/Makefile
+++ b/Makefile
@@ -13,16 +13,20 @@ openrussian : openrussian.lua
chmod a+x $@
# Marking the ZIP intermediate makes sure it is automatically deleted after
-# generating the SQLITE database while we don't always re-download it.
+# generating the SQLite database while we don't always re-download it.
# Effectively, it will behave like a temporary file.
-.INTERMEDIATE: openrussian-sql.zip
+# NOTE: This is disabled for the time being since the database schema changes
+# from time to time, so a rebuild could easily break the script.
+# Instead, we add the file to Git, so every clone is guaranteed to contain
+# a database matching the openrussian.lua script.
+#.INTERMEDIATE: openrussian-sql.zip
openrussian-sql.zip:
wget -O $@ 'https://en.openrussian.org/downloads/openrussian-sql.zip'
-# NOTE: VACUUMing the database saves a few megabytes
-openrussian-sqlite3.db : openrussian-sql.zip mysql2sqlite
+openrussian-sqlite3.db : openrussian-sql.zip mysql2sqlite postprocess.sql
+ $(RM) $@
unzip -p $< openrussian.sql | ./mysql2sqlite - | sqlite3 $@
- sqlite3 $@ VACUUM
+ sqlite3 $@ -batch <postprocess.sql
# NOTE: Installation of the Bash completions depends on the Debain bash-completion
# package being installed or something similar
@@ -33,4 +37,4 @@ install : openrussian openrussian-sqlite3.db openrussian-completion.bash
cp openrussian-completion.bash $(DESTDIR)$(COMPLETIONSDIR)/openrussian
clean:
- $(RM) -f openrussian openrussian-sql.zip openrussian-sqlite3.db
+ $(RM) openrussian openrussian-sql.zip openrussian-sqlite3.db
diff --git a/openrussian-completion.bash b/openrussian-completion.bash
index fd27c06..9892d65 100644
--- a/openrussian-completion.bash
+++ b/openrussian-completion.bash
@@ -1,33 +1,20 @@
#/bin/bash
-# NOTE: Uses SQL for matching instead of compgen, since it's probably not
-# a good idea to always retrieve all the words, although this happens anyway
-# when completing an empty word.
-
_openrussian_completions()
{
- # NOTE: sqlite3's command-line tool does not provide a way to properly
- # embed strings, so we try to escape it here.
- WORD=$(echo -n "$2" | sed 's/"/""/g')
-
- # This is a rather convoluted way of writing
- # SELECT bare FROM words WHERE bare LIKE "${WORD}%";
- # but allowing $WORD to contain accentuation characters, which can easily
- # happen when cutting/pasting words from the openrussian.lua manpages or the
- # internet.
- # NOTE: This is merely a workaround since all completions will begin with
- # $WORD including accents and end without accents, so the suggested completions
- # will likely be with wrong accents.
- # It seems to be impossible, at least in Bash, to rubout $WORD first.
- SQL=$(printf 'SELECT "%s" || SUBSTR(bare, LENGTH(REPLACE("%s", "\u0301", ""))+1)
- FROM words WHERE bare LIKE REPLACE("%s%%", "\u0301", "")' \
- "$WORD" "$WORD" "$WORD")
-
- # Calculate database path based on the installation path of the `openrussian`
- # CLI tool. This way, we can avoid preprocessing the script during installation.
- PREFIX=$(dirname $(which openrussian))/..
+ # Autocompletion of all search terms can be outsourced
+ # to the openrussian.lua script, which has the advantage
+ # of taking all command line flags into account.
+ #
+ # NOTE: mapfile is used to properly capture lines containing
+ # whitespace
+ # FIXME: Does not work if the last token contains spaces
+ # FIXME: Also fails when trying to complete multiple tokens
+ # (in order to avoid white-space quoting).
+ mapfile -t COMPREPLY < <(openrussian -C "${COMP_WORDS[@]:1}" 2>/dev/null)
- COMPREPLY=($(sqlite3 "$PREFIX/share/openrussian/openrussian-sqlite3.db" "$SQL"))
+ # NOTE: openrussian.lua currently does not complete switch-names.
+ COMPREPLY+=($(compgen -W "-Len -Lde -V -p" -- "$2"))
}
complete -F _openrussian_completions openrussian
diff --git a/openrussian-sql.zip b/openrussian-sql.zip
new file mode 100644
index 0000000..585e773
--- /dev/null
+++ b/openrussian-sql.zip
Binary files differ
diff --git a/openrussian.lua b/openrussian.lua
index b75de7f..bd25930 100755
--- a/openrussian.lua
+++ b/openrussian.lua
@@ -1,29 +1,69 @@
#!/usr/bin/lua5.2
local driver = require "luasql.sqlite3"
+local lutf8 = require "lua-utf8"
+
+local ACCENT = lutf8.char(0x0301) -- Accent combining character
+
+local lang = os.setlocale(nil, "ctype"):match("^([^_]+)")
+
+local search_words = {}
local function usage(stream)
- stream:write("Usage: ", arg[0], " [-p] <word>\n")
+ stream:write("Usage: ", arg[0], " [-L<lang>] [-V] [-p] <pattern...>\n",
+ "\t-L<lang> Set language to <lang> (currently en or de, guessed from locale)\n",
+ "\t-V Verbatim matching (no case folding and inflections)\n",
+ "\t-p Print Troff code to stdout\n")
end
for i = 1, #arg do
if arg[i]:sub(1, 1) == "-" then
- if arg[i]:sub(2) == "p" then
+ local opt = arg[i]:sub(2)
+
+ if opt:sub(1, 1) == "L" then
+ if #opt > 1 then
+ lang = opt:sub(2)
+ elseif i == #arg then
+ usage(io.stderr)
+ os.exit(false)
+ else
+ lang = arg[i+1]
+ i = i + 1
+ end
+ elseif opt == "V" then
+ verbatim = true
+ elseif opt == "p" then
use_stdout = true
+ elseif opt == "C" then
+ -- This is a "secret" command used for implementing
+ -- auto-completions.
+ -- It will usually be the first argument.
+ auto_complete = true
else
usage(io.stderr)
os.exit(false)
end
else
- search_word = arg[i]
+ table.insert(search_words, arg[i])
end
end
-if not search_word then
+if #search_words == 0 then
usage(io.stderr);
os.exit(false)
end
+-- Allowing multiple arguments to be concat into the search words
+-- is useful when searching for a translation which may contain
+-- spaces without quoting the entire search term.
+local search_word = table.concat(search_words, " ")..
+ (auto_complete and "*" or "")
+
+-- FIXME: Currently only English and German are actually
+-- contained in the database, but this might change.
+-- Perhaps query the availability dynamically.
+if lang ~= "en" and lang ~= "de" then lang = "en" end
+
local function dirname(path)
return path:match("^(.*)/.+$") or "."
end
@@ -37,35 +77,71 @@ local database = PREFIX.."/share/openrussian/openrussian-sqlite3.db"
if not io.open(database) then database = "openrussian-sqlite3.db" end
local out_stream
-local lang = "en"
local env = assert(driver.sqlite3())
local con = assert(env:connect(database))
+-- A SQL-compatible globber.
+-- Necessary since globbing is usually done as part of the
+-- SQL query.
+--
+-- NOTE: This may be reimplemented more efficiently by translating
+-- the glob pattern to a Lua pattern.
+-- Unfortunately, the Glob pattern syntax appears to be undefined,
+-- probably because it defaults to the system glob.
+--
+-- Alternatively, we might override the MATCH function with Lua patterns
+-- and use MATCH instead of GLOB, but this might be inefficient.
+-- In order to make use of the query optimizer, we must either use
+-- LIKE or GLOB.
+--
+-- Yet another alternative might be to parse all translations into
+-- a separate index, speeding up translation lookups and avoiding
+-- the need for globbing in Lua here.
+function glob(pattern, str)
+ local cur = assert(con:execute(string.format([[
+ SELECT '%s' GLOB '%s'
+ ]], con:escape(str), con:escape(pattern))))
+ local row = assert(cur:fetch())
+ cur:close()
+
+ return row ~= 0
+end
+
-- Turns a character followed by apostroph into a combined
-- accented character.
-- NOTE: This encodes the accent (u0301) in bytes, so it can be
-- used for printing to stdout or into Troff code.
local function map_accented(str)
- return (str:gsub("'", "\xCC\x81"))
+ return (lutf8.gsub(str, "'", ACCENT))
end
--- FIXME: This does not work for tables since tbl will count the
+-- FIXME: map_accented() does not work for tables since tbl will count the
-- combined character as two. Theoretically, Groff has composite characters
-- like \u[u043E_0301] but they don't work for all the cyrillic
-- vocals.
+-- If we really wanted to, we could replace every accented character
+-- with an inline macro that is defined at Troff runtime depending on the
+-- output device, so we could get accented characters in PDF tables at least.
local function map_tbl(str)
- return (str:gsub("(..)'", "\\fI%1\\fP"))
+ return (lutf8.gsub(str, "(.)'", "\\fI%1\\fP"))
end
+-- FIXME: Apparently, there are entries without declension or empty declension
+-- entries, e.g. kosha4ij.
+-- These should be detected and the entire section should be omitted.
local function format_declension(tag, decl_id, short_form)
local cur = assert(con:execute(string.format([[
SELECT * FROM declensions WHERE id = %d
]], decl_id)))
local row = assert(cur:fetch({}, "a"))
cur:close()
- out_stream:write(tag, ';', map_tbl(row.nom), ';', map_tbl(row.gen), ';',
- map_tbl(row.dat), ';', map_tbl(row.acc), ';',
- map_tbl(row.inst), ';', map_tbl(row.prep))
+
+ out_stream:write(tag, ';', map_tbl(row.nom or "-"), ';',
+ map_tbl(row.gen or "-"), ';',
+ map_tbl(row.dat or "-"), ';',
+ map_tbl(row.acc or "-"), ';',
+ map_tbl(row.inst or "-"), ';',
+ map_tbl(row.prep or "-"))
if short_form then out_stream:write(';', map_tbl(short_form)) end
out_stream:write('\n')
end
@@ -83,11 +159,14 @@ function format.noun(word_id, accented)
local cur = assert(con:execute(string.format([[
SELECT * FROM nouns WHERE word_id = %d
]], word_id)))
- local row = assert(cur:fetch({}, "a"))
+ local row = cur:fetch({}, "a")
cur:close()
- out_stream:write('.SH WORD\n',
- map_accented(accented), ' \\-\\- noun, ')
+ -- NOTE: This can probably happen as with any other word category
+ -- (example?)
+ if not row then return end
+
+ out_stream:write('.SH GENDER\n')
if row.gender and row.gender ~= "" then
local genders = {m = "male", f = "female", n = "neuter"}
out_stream:write(genders[row.gender], ', ')
@@ -95,12 +174,16 @@ function format.noun(word_id, accented)
out_stream:write(row.animate == 1 and 'animate' or 'inanimate', '\n')
if row.partner and row.partner ~= "" then
- -- FIXME: What exactly is a noun "partner"?
- -- Seems to be used mostly for male/female pairs etc.
+ -- NOTE: Noun "partners" seem to be male/female counterparts.
+ -- FIXME: It would also be nice to include an accented version,
+ -- but since the DB lists the partner as a string instead of
+ -- word_id, finding the right entry could be unreliable
out_stream:write('.SH PARTNER\n',
row.partner, '\n')
end
+ -- FIXME: Rotate this table (only two columns: singular and plural).
+ -- Lines should be short.
out_stream:write('.SH DECLENSION\n',
'.TS\n',
'allbox,tab(;);\n',
@@ -128,12 +211,18 @@ function format.adjective(word_id, accented)
local cur = assert(con:execute(string.format([[
SELECT * FROM adjectives WHERE word_id = %d
]], word_id)))
- local row = assert(cur:fetch({}, "a"))
+ local row = cur:fetch({}, "a")
cur:close()
- out_stream:write('.SH WORD\n',
- map_accented(accented), ' \\-\\- adjective\n')
+ -- NOTE: Seldomly (e.g. nesomnenno), there is no entry in adjectives
+ if not row then return end
+
+ --out_stream:write('.SH CATEGORY\n',
+ -- 'adjective\n')
+ -- FIXME: Rotate this table (columns will be gender+plural).
+ -- Lines should be short.
+ -- FIXME: Short form not always present
out_stream:write('.SH DECLENSION\n',
'.TS\n',
'allbox,tab(;);\n',
@@ -146,36 +235,52 @@ function format.adjective(word_id, accented)
format_declension('Plural', row.decl_pl_id, row.short_pl)
out_stream:write('.TE\n')
- if row.comparative then
+ if row.comparative and row.comparative ~= "" then
out_stream:write('.SH COMPARATIVE\n',
map_accented(row.comparative), '\n')
end
- if row.superlative then
+ if row.superlative and row.superlative ~= "" then
out_stream:write('.SH SUPERLATIVE\n',
map_accented(row.superlative), '\n')
end
end
+-- NOTE: There is no separate table for adverbs
+-- Currently, we wouldn't print more than the category, which is also in the
+-- header, so it is omitted.
+function format.adverb(word_id, accented)
+ --out_stream:write('.SH CATEGORY\n',
+ -- 'adverb\n')
+end
+
function format.verb(word_id, accented)
local cur = assert(con:execute(string.format([[
SELECT * FROM verbs JOIN conjugations ON verbs.presfut_conj_id = conjugations.id
WHERE verbs.word_id = %d
]], word_id)))
- local row = assert(cur:fetch({}, "a"))
+ local row = cur:fetch({}, "a")
cur:close()
- out_stream:write('.SH WORD\n',
- map_accented(accented), ' \\-\\- verb')
- if row.aspect then out_stream:write(', ', row.aspect) end
- out_stream:write('\n')
+ -- NOTE: Seldomly (e.g. est' -- to be), there is no entry in verbs
+ if not row then return end
+
+ if row.aspect then
+ out_stream:write('.SH ASPECT\n',
+ row.aspect, '\n')
+ end
if row.partner and row.partner ~= "" then
- -- NOTE: Verb partners seem to be the aspect partners
+ -- NOTE: Verb partners seem to be the aspect partners.
+ -- They are either comma or semicolon separated.
+ -- FIXME: It would also be nice to include an accented version,
+ -- but since the DB lists the partner as a string instead of
+ -- word_id, finding the right entry could be unreliable
out_stream:write('.SH PARTNER\n',
- row.partner, '\n')
+ lutf8.gsub(row.partner, "[;,]", ", "), '\n')
end
+ -- FIXME: Conjugation sometimes empty (e.g. widat')
-- FIXME: Can we assume that verbs without specified aspect are always
-- perfective?
out_stream:write('.SH ', row.aspect == "imperfective" and 'PRESENT\n' or 'FUTURE\n',
@@ -200,22 +305,26 @@ function format.verb(word_id, accented)
end
function format.other(word_id, accented)
- out_stream:write('.SH WORD\n',
- map_accented(accented), '\n')
+ --out_stream:write('.SH CATEGORY\n',
+ -- 'other\n')
end
-function get_translations(word_id)
+local function get_translations(word_id)
local ret = {}
-- FIXME: Fetch other translations if primary
-- language is not available
local cur = assert(con:execute(string.format([[
SELECT tl FROM translations
- WHERE word_id = %d AND lang = "%s"
- ]], word_id, lang)))
+ WHERE word_id = %d AND lang = '%s'
+ ]], word_id, con:escape(lang))))
local row = cur:fetch({}, "a")
while row do
- table.insert(ret, row.tl)
+ -- NOTE: One entry might contain many comma-separated
+ -- translations
+ for word in lutf8.gmatch(row.tl..", ", "(.-), ") do
+ table.insert(ret, word)
+ end
row = cur:fetch({}, "a")
end
cur:close()
@@ -223,13 +332,32 @@ function get_translations(word_id)
return ret
end
--- NOTE: This lets SQL strip the accent char from the input, which
--- allows users to cut and paste from generated output while we don't
--- have to deal with Unicode in Lua.
+-- Format reference to row from the words-table.
+-- FIXME: Not printed bold since bold text and accents
+-- don't work together (URxvt).
+local function get_reference(word_row)
+ return map_accented(word_row.accented or word_row.bare)..
+ '('..(word_row.type or "other")..')'
+end
+
+-- NOTE: This strips the accent char, so users can cut and paste from
+-- generated output.
+-- This is done from Lua, since the right-hand side of GLOB should be a constant
+-- to allow optimizations:
+-- https://www.sqlite.org/optoverview.html#the_like_optimization
+--
+-- TODO: Double-check whether the GLOB is actually optimized.
+-- Theoretically, we need COLLATE BINARY for that.
+--
+-- FIXME: Case-folding UTF8 / Collating is not supported by SQLite3.
+-- If we want to support case-insensitive matching, it is mandatory, though.
+-- Could be done using the ICU extension:
+-- https://www.sqlite.org/src/artifact?ci=trunk&filename=ext/icu/README.txt
local cur = assert(con:execute(string.format([[
- SELECT bare, accented, type, words.id AS word_id
- FROM words WHERE bare = REPLACE("%s", CHAR(0x0301), "")
-]], search_word)))
+ SELECT bare AS completions, * FROM words
+ WHERE LIKELY(disabled = 0) AND bare GLOB '%s'
+ ORDER BY rank
+]], con:escape(lutf8.gsub(search_word, ACCENT, "")))))
local rows = {}
local row
@@ -240,17 +368,189 @@ until not row
cur:close()
+if not verbatim then
+ --[==[
+ -- FIXME: These queries are tooo sloooow! Perhaps that's why the openrussion.org
+ -- website does not allow searching by declension prefixes.
+ -- This is because of the need for string-concatenations for every possible word
+ -- and because the GLOBbing cannot be optimized, even in the most common cases.
+ -- FIXME: This does not find braced-terms. Glob patterns are simply not powerful
+ -- enough to express "optional brace".
+ -- We'd probably need regexp for that.
+ cur = assert(con:execute(string.format([[
+ SELECT REPLACE(temp, "'", "") AS completions, words.* FROM words JOIN (
+ -- Search word might be a noun or adjective declension
+ SELECT nom||","||gen||","||dat||","||acc||","||inst||","||prep AS temp, word_id
+ FROM declensions
+ UNION
+ -- Search word might be a special adjective inflection
+ SELECT comparative||","||superlative||","||
+ short_m||","||short_f||","||short_n||","||short_pl AS temp, word_id
+ FROM adjectives
+ UNION
+ -- Search word might be a verb imperative, past form or conjugation
+ SELECT imperative_sg||","||imperative_pl||","||past_m||","||past_f||","||past_n||","||past_pl||
+ sg1||","||sg2||","||sg3||","||pl1||","||pl2||","||pl3 AS temp, verbs.word_id
+ FROM verbs LEFT JOIN conjugations ON presfut_conj_id = conjugations.id
+ ) ON words.id = word_id
+ WHERE LIKELY(disabled = 0) AND ","||completions||"," GLOB '*,%s,*'
+ ORDER BY rank
+ ]], con:escape(lutf8.gsub(search_word, ACCENT, "")))))
+
+ -- This is an alternative to the above query.
+ -- It eliminates the concatenations, but has to iterate many tables redundantly.
+ -- Effectively it is twice as slow as the above query...
+ cur = assert(con:execute(string.format([[
+ SELECT REPLACE(temp, "'", "") AS completions, words.* FROM words JOIN (
+ -- Search word might be a noun or adjective declension
+ SELECT nom AS temp, word_id FROM declensions
+ UNION ALL
+ SELECT gen AS temp, word_id FROM declensions
+ UNION ALL
+ SELECT dat AS temp, word_id FROM declensions
+ UNION ALL
+ SELECT acc AS temp, word_id FROM declensions
+ UNION ALL
+ SELECT inst AS temp, word_id FROM declensions
+ UNION ALL
+ SELECT prep AS temp, word_id FROM declensions
+ UNION ALL
+ -- Search word might be a special adjective inflection
+ SELECT comparative AS temp, word_id FROM adjectives
+ UNION ALL
+ SELECT superlative AS temp, word_id FROM adjectives
+ UNION ALL
+ SELECT short_m AS temp, word_id FROM adjectives
+ UNION ALL
+ SELECT short_f AS temp, word_id FROM adjectives
+ UNION ALL
+ SELECT short_n AS temp, word_id FROM adjectives
+ UNION ALL
+ SELECT short_pl AS temp, word_id FROM adjectives
+ UNION ALL
+ -- Search word might be a verb imperative or past form
+ SELECT imperative_sg AS temp, word_id FROM verbs
+ UNION ALL
+ SELECT imperative_pl AS temp, word_id FROM verbs
+ UNION ALL
+ SELECT past_m AS temp, word_id FROM verbs
+ UNION ALL
+ SELECT past_f AS temp, word_id FROM verbs
+ UNION ALL
+ SELECT past_n AS temp, word_id FROM verbs
+ UNION ALL
+ SELECT past_pl AS temp, word_id FROM verbs
+ UNION ALL
+ -- Search word might be a verb conjugation
+ SELECT sg1 AS temp, word_id FROM conjugations
+ UNION ALL
+ SELECT sg2 AS temp, word_id FROM conjugations
+ UNION ALL
+ SELECT sg3 AS temp, word_id FROM conjugations
+ UNION ALL
+ SELECT pl1 AS temp, word_id FROM conjugations
+ UNION ALL
+ SELECT pl2 AS temp, word_id FROM conjugations
+ UNION ALL
+ SELECT pl3 AS temp, word_id FROM conjugations
+ ) ON words.id = word_id
+ WHERE LIKELY(disabled = 0) AND completions GLOB '%s'
+ ORDER BY rank
+ ]], con:escape(lutf8.gsub(search_word, ACCENT, "")))))
+ ]==]
+
+ -- This query uses a new `bare_inflections` table, since all queries
+ -- using existing tables (see above) are way too slow, especially for
+ -- autocompletions.
+ -- NOTE: The right-hand side of GLOB must be a constant, so that it can be
+ -- optimized using the index.
+ cur = assert(con:execute(string.format([[
+ SELECT bare_inflections.bare AS completions, words.*
+ FROM words JOIN bare_inflections ON words.id = word_id
+ WHERE LIKELY(disabled = 0) AND completions GLOB '%s'
+ ORDER BY rank
+ ]], con:escape(lutf8.gsub(search_word, ACCENT, "")))))
+
+ repeat
+ row = cur:fetch({}, "a")
+ table.insert(rows, row)
+ until not row
+
+ cur:close()
+end
+
+-- Only if we do not find a Russian word, we try to find a translation.
+-- This is not wrapped with the above query into one using a LEFT JOIN since
+-- two queries are significantly faster - probably because of having to perform less
+-- string concatenations.
+if #rows == 0 then
+ -- NOTE: The translation entry frequently contains a comma-separated
+ -- list of translations
+ --
+ -- FIXME: Case folding only works for ASCII, which should be sufficient for
+ -- German/English text (almost)...
+ -- FIXME: The string concatenation is a real slow-down and the GLOB cannot
+ -- be optimized.
+ -- Perhaps the translations should be in their own (new) indexed table.
+ cur = assert(con:execute(string.format([[
+ SELECT %s(", "||tl||", ") AS completions, words.*
+ FROM words JOIN translations ON words.id = word_id
+ WHERE LIKELY(disabled = 0) AND lang = '%s' AND completions GLOB %s('*, %s, *')
+ ORDER BY rank
+ ]], verbatim and "" or "LOWER", con:escape(lang), verbatim and "" or "LOWER", con:escape(search_word))))
+
+ repeat
+ row = cur:fetch({}, "a")
+ table.insert(rows, row)
+ until not row
+
+ cur:close()
+end
+
+if auto_complete then
+ -- FIXME: See above for notes on case-folding
+ local search_word_bare = lutf8.gsub(search_word, ACCENT, "")
+ search_word_bare = verbatim and search_word_bare or search_word_bare:lower()
+
+ for _, row in ipairs(rows) do
+ -- NOTE: This code is reused for Russian base words, inflections and translated lookups,
+ -- so there is a common `completions` column.
+ -- Russian words can be treated like single-word translations.
+ -- Terms in this column can be comma-separated with and without spaces and
+ -- there may be braces.
+ for word in lutf8.gmatch(row.completions..",", " *%(?(.-)%)?,") do
+ if glob(search_word, word) then
+ io.stdout:write(search_words[#search_words],
+ lutf8.sub(word, lutf8.len(search_word_bare)), "\n")
+ end
+ end
+ end
+
+ os.exit(true)
+end
+
if #rows == 0 then
io.stderr:write('Word "', search_word, '" not found!\n')
os.exit(false)
end
-if #rows == 1 then
- row = rows[1]
+-- Filter out duplicates
+local word_ids = {}
+local unique_rows = {}
+
+for _, row in ipairs(rows) do
+ if not word_ids[row.id] then
+ table.insert(unique_rows, row)
+ word_ids[row.id] = true
+ end
+end
+
+if #unique_rows == 1 then
+ row = unique_rows[1]
else
- for i, row in ipairs(rows) do
+ for i, row in ipairs(unique_rows) do
local word_accented = row.accented or row.bare
- local tl = get_translations(row.word_id)
+ local tl = get_translations(row.id)
io.stdout:write(i, ") ", map_accented(word_accented))
if #tl > 0 then io.stdout:write(" (", table.concat(tl, ", "), ")") end
@@ -258,68 +558,143 @@ else
end
repeat
- io.stdout:write("Show [1..", #rows, ", press enter to cancel]? "):flush()
+ io.stdout:write("Show [1..", #unique_rows, ", press enter to cancel]? "):flush()
local choice = io.stdin:read():lower()
if choice == "" or choice == "q" then os.exit() end
- row = rows[tonumber(choice)]
+ row = unique_rows[tonumber(choice)]
until row
end
-local word_id = row.word_id
-local word_type = row.type or "other"
+local word_id = row.id
-- NOTE: Some words (e.g. personal pronouns) apparently do not
-- come with accents!?
local word_accented = row.accented or row.bare
+local word_derived_from = row.derived_from_word_id
+local word_audio = row.audio
+local word_usage = row["usage_"..lang]
+local word_type = row.type or "other"
-- Open stream only now, after no more messages have to be written to
-- stdout/stderr.
out_stream = assert(use_stdout and io.stdout or io.popen("man /dev/stdin", "w"))
+-- NOTE: The headers and footers shouldn't contain critical information
+-- since they might not be printed at all.
out_stream:write('.\\" t\n',
- '.TH "', search_word, '" "', word_type, '"\n')
+ '.TH "', row.bare, '" "', word_type, '" "')
+if row.rank then
+ out_stream:write('#', row.rank, row.level and ' ('..row.level..')' or '')
+else
+ out_stream:write(row.level)
+end
+out_stream:write('" "openrussian.lua" "openrussian.org"\n')
+
+--
+-- Generic WORD section with translation.
+--
+out_stream:write('.SH WORD\n',
+ map_accented(word_accented))
+local tl = get_translations(word_id)
+if #tl > 0 then
+ out_stream:write(' \\-\\- ', table.concat(tl, ', '))
+end
+out_stream:write('\n')
--
-- Word-specific sections
+-- NOTE: word_accented is required only for format.noun() and could be
+-- avoided altogether.
--
format[word_type](word_id, word_accented)
--
-- Generic sections
--
-local tl = get_translations(word_id)
-if #tl > 0 then
- out_stream:write('.SH TRANSLATION\n',
- table.concat(tl, ', '), '\n')
+if word_usage then
+ out_stream:write('.SH USAGE\n',
+ word_usage, '\n')
+end
+
+-- FIXME: Perhaps this should rather be part of the SEE ALSO section
+if word_derived_from then
+ cur = assert(con:execute(string.format([[
+ SELECT bare, accented, type FROM words
+ WHERE LIKELY(disabled = 0) AND id = %d
+ ]], word_derived_from)))
+ row = assert(cur:fetch({}, "a"))
+ cur:close()
+
+ out_stream:write('.SH DERIVED FROM\n',
+ get_reference(row), '\n')
end
--
--- NOTE: There can be many examples, so print them last.
+-- NOTE: There can be many examples, so print them late.
--
cur = assert(con:execute(string.format([[
SELECT ru, start, length, tl
FROM sentences_words JOIN sentences ON sentence_id = sentences.id
- WHERE word_id = %d AND lang = "%s"
-]], word_id, lang)))
+ WHERE word_id = %d AND lang = '%s'
+]], word_id, con:escape(lang))))
row = cur:fetch({}, "a")
if row then
out_stream:write('.SH EXAMPLES\n')
repeat
- -- FIXME: Highlight search word in sentences.
- -- start/length are apparently in characters
- -- instead of bytes.
- --[[
- local ru_hl = row.ru:sub(1, row.start)..'\\fI'..
- row.ru:sub(row.start+1, row.start+1+row.length)..'\\fP'..
- row.ru:sub(row.start+1+row.length+1)
- ]]
+ -- FIXME: The accent is not always available in the default
+ -- italic font when formatting for PDF.
+ local ru_hl = lutf8.sub(row.ru, 1, row.start)..'\\fI'..
+ lutf8.sub(row.ru, row.start+1, row.start+1+row.length-1)..'\\fP'..
+ lutf8.sub(row.ru, row.start+1+row.length)
+
out_stream:write('.TP\n',
- map_accented(row.ru), '\n',
+ map_accented(ru_hl), '\n',
row.tl, '\n')
row = cur:fetch({}, "a")
until not row
+end
+cur:close()
+-- Audio recordings might be useful occasionally, but this is an offline/terminal
+-- application, so it makes sense to print them last (like URLs in manpages).
+--
+-- NOTE: There is an UE man-macro, but it doesn't seem to be very helpful here and
+-- seems to bring no advantages when formatting as a PDF.
+-- It could be typset in the default fixed-width font (\fC), but it does not contain
+-- cyrillic characters, so we don't do that either.
+if word_audio then
+ out_stream:write('.SH AUDIO\n',
+ word_audio, '\n')
end
+
+-- Disable adjusting (space-stretching) for the related-word lists.
+-- Don't forget to enable this again if something follows these sections.
+out_stream:write('.na\n')
+
+-- NOTE: The results are grouped by relation, so that they can be
+-- easily printed in one section per relation.
+-- Unfortunately, we cannot define custom collating sequences with LuaSQL.
+-- FIXME: Print this under a single SEE ALSO master section?
+-- FIXME: Results should perhaps be ordered by `type`?
+cur = assert(con:execute(string.format([[
+ SELECT bare, accented, type, relation
+ FROM words_rels JOIN words ON rel_word_id = words.id
+ WHERE LIKELY(disabled = 0) AND words_rels.word_id = %d
+ ORDER BY relation, rank
+]], word_id)))
+
+local cur_relation
+row = cur:fetch({}, "a")
+while row do
+ if cur_relation ~= row.relation then
+ cur_relation = row.relation
+ out_stream:write('.SH ', cur_relation:upper(), '\n')
+ end
+ out_stream:write(get_reference(row))
+ row = cur:fetch({}, "a")
+ out_stream:write(row and row.relation == cur_relation and ', ' or '\n')
+end
+
cur:close()
--
diff --git a/postprocess.sql b/postprocess.sql
new file mode 100644
index 0000000..5976cb1
--- /dev/null
+++ b/postprocess.sql
@@ -0,0 +1,78 @@
+-- Drop unused tables, hopefully saving a few megabytes.
+-- `expressions_words` is for excercises???
+DROP TABLE expressions_words;
+-- `categories_words2` references `categories` but is not in the database!?
+DROP TABLE categories_words2;
+
+-- This table is especially added to aid lookups and autocompletions on all
+-- possible inflections.
+-- This is significantly faster than using the existing tables.
+--
+-- FIXME: Some of the columns contain multiple comma-separated values
+-- (sometimes in braces), so we probably need a Lua script for initializion.
+-- TODO: Translations should probably be in this table, or a separate one as well.
+-- But see above.
+CREATE TABLE bare_inflections (`word_id` INTEGER NOT NULL, `bare` VARCHAR(100) NOT NULL COLLATE BINARY);
+-- Indexing the `bare` column allows ultra-fast lookups even via GLOB
+-- in most common cases. For this optimization to work, the column must
+-- also be BINARY collated. See also:
+-- https://www.sqlite.org/optoverview.html#the_like_optimization
+CREATE INDEX idx_bare_inflections_bare ON bare_inflections (`bare`);
+INSERT INTO bare_inflections SELECT word_id, REPLACE(temp, "'", "") AS bare
+FROM (
+ -- Search word might be a noun or adjective declension
+ SELECT word_id, nom AS temp FROM declensions
+ UNION ALL
+ SELECT word_id, gen AS temp FROM declensions
+ UNION ALL
+ SELECT word_id, dat AS temp FROM declensions
+ UNION ALL
+ SELECT word_id, acc AS temp FROM declensions
+ UNION ALL
+ SELECT word_id, inst AS temp FROM declensions
+ UNION ALL
+ SELECT word_id, prep AS temp FROM declensions
+ UNION ALL
+ -- Search word might be a special adjective inflection
+ SELECT word_id, comparative AS temp FROM adjectives
+ UNION ALL
+ SELECT word_id, superlative AS temp FROM adjectives
+ UNION ALL
+ SELECT word_id, short_m AS temp FROM adjectives
+ UNION ALL
+ SELECT word_id, short_f AS temp FROM adjectives
+ UNION ALL
+ SELECT word_id, short_n AS temp FROM adjectives
+ UNION ALL
+ SELECT word_id, short_pl AS temp FROM adjectives
+ UNION ALL
+ -- Search word might be a verb imperative or past form
+ SELECT word_id, imperative_sg AS temp FROM verbs
+ UNION ALL
+ SELECT word_id, imperative_pl AS temp FROM verbs
+ UNION ALL
+ SELECT word_id, past_m AS temp FROM verbs
+ UNION ALL
+ SELECT word_id, past_f AS temp FROM verbs
+ UNION ALL
+ SELECT word_id, past_n AS temp FROM verbs
+ UNION ALL
+ SELECT word_id, past_pl AS temp FROM verbs
+ UNION ALL
+ -- Search word might be a verb conjugation
+ SELECT word_id, sg1 AS temp FROM conjugations
+ UNION ALL
+ SELECT word_id, sg2 AS temp FROM conjugations
+ UNION ALL
+ SELECT word_id, sg3 AS temp FROM conjugations
+ UNION ALL
+ SELECT word_id, pl1 AS temp FROM conjugations
+ UNION ALL
+ SELECT word_id, pl2 AS temp FROM conjugations
+ UNION ALL
+ SELECT word_id, pl3 AS temp FROM conjugations
+)
+WHERE bare <> "";
+
+-- Saves a few megabytes
+VACUUM;