diff options
author | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2019-04-18 23:20:46 +0300 |
---|---|---|
committer | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2019-04-18 23:20:46 +0300 |
commit | 481c69be02c95fbf20567d8de617315484e2d692 (patch) | |
tree | 71bf406543586f278a5e779a0befa57f0fe529e9 | |
parent | 07d4fa1fe85eba554becb20bb4a872acb320a423 (diff) | |
download | openrussian-cli-481c69be02c95fbf20567d8de617315484e2d692.tar.gz |
major changes (unfortunately not done systematically)
* keep the openrussian-sql.zip to avoid bitrot (sudden breakage when building the
project against a recent DB)
* allow lookups against inflections.
A special table `bare_inflections` was added to the DB in order to speed this up.
* authentic autocompletions taking every parameter into account, using a magic `-C` parameter
* language selection via locale or -L<lang>
* translation lookup
* allow multiple search terms on the command line without escaping
* all SQL strings are properly escaped now to avoid (accidental) code insertion
* luautf8 is mandatory now and used much more, which is safer
than handling Unicode strings with the builtin functions
* ignore "disabled" database entries
* added option -V (verbatim)
* more protections against empty fields
* print "DERIVED FROM", "AUDIO", "USAGE" and various word relation sections
* print word ranking and level (A1-C2...)
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | Makefile | 16 | ||||
-rw-r--r-- | openrussian-completion.bash | 37 | ||||
-rw-r--r-- | openrussian-sql.zip | bin | 0 -> 35607877 bytes | |||
-rwxr-xr-x | openrussian.lua | 505 | ||||
-rw-r--r-- | postprocess.sql | 78 |
6 files changed, 542 insertions, 97 deletions
@@ -1,3 +1,4 @@ openrussian -openrussian-sql.zip openrussian-sqlite3.db +# This file is currently versioned, so it should not be ignored. +#openrussian-sql.zip @@ -13,16 +13,20 @@ openrussian : openrussian.lua chmod a+x $@ # Marking the ZIP intermediate makes sure it is automatically deleted after -# generating the SQLITE database while we don't always re-download it. +# generating the SQLite database while we don't always re-download it. # Effectively, it will behave like a temporary file. -.INTERMEDIATE: openrussian-sql.zip +# NOTE: This is disabled for the time being since the database schema changes +# from time to time, so a rebuild could easily break the script. +# Instead, we add the file to Git, so every clone is guaranteed to contain +# a database matching the openrussian.lua script. +#.INTERMEDIATE: openrussian-sql.zip openrussian-sql.zip: wget -O $@ 'https://en.openrussian.org/downloads/openrussian-sql.zip' -# NOTE: VACUUMing the database saves a few megabytes -openrussian-sqlite3.db : openrussian-sql.zip mysql2sqlite +openrussian-sqlite3.db : openrussian-sql.zip mysql2sqlite postprocess.sql + $(RM) $@ unzip -p $< openrussian.sql | ./mysql2sqlite - | sqlite3 $@ - sqlite3 $@ VACUUM + sqlite3 $@ -batch <postprocess.sql # NOTE: Installation of the Bash completions depends on the Debain bash-completion # package being installed or something similar @@ -33,4 +37,4 @@ install : openrussian openrussian-sqlite3.db openrussian-completion.bash cp openrussian-completion.bash $(DESTDIR)$(COMPLETIONSDIR)/openrussian clean: - $(RM) -f openrussian openrussian-sql.zip openrussian-sqlite3.db + $(RM) openrussian openrussian-sql.zip openrussian-sqlite3.db diff --git a/openrussian-completion.bash b/openrussian-completion.bash index fd27c06..9892d65 100644 --- a/openrussian-completion.bash +++ b/openrussian-completion.bash @@ -1,33 +1,20 @@ #/bin/bash -# NOTE: Uses SQL for matching instead of compgen, since it's probably not -# a good idea to always retrieve all the words, 
although this happens anyway -# when completing an empty word. - _openrussian_completions() { - # NOTE: sqlite3's command-line tool does not provide a way to properly - # embed strings, so we try to escape it here. - WORD=$(echo -n "$2" | sed 's/"/""/g') - - # This is a rather convoluted way of writing - # SELECT bare FROM words WHERE bare LIKE "${WORD}%"; - # but allowing $WORD to contain accentuation characters, which can easily - # happen when cutting/pasting words from the openrussian.lua manpages or the - # internet. - # NOTE: This is merely a workaround since all completions will begin with - # $WORD including accents and end without accents, so the suggested completions - # will likely be with wrong accents. - # It seems to be impossible, at least in Bash, to rubout $WORD first. - SQL=$(printf 'SELECT "%s" || SUBSTR(bare, LENGTH(REPLACE("%s", "\u0301", ""))+1) - FROM words WHERE bare LIKE REPLACE("%s%%", "\u0301", "")' \ - "$WORD" "$WORD" "$WORD") - - # Calculate database path based on the installation path of the `openrussian` - # CLI tool. This way, we can avoid preprocessing the script during installation. - PREFIX=$(dirname $(which openrussian))/.. + # Autocompletion of all search terms can be outsourced + # to the openrussian.lua script, which has the advantage + # of taking all command line flags into account. + # + # NOTE: mapfile is used to properly capture lines containing + # whitespace + # FIXME: Does not work if the last token contains spaces + # FIXME: Also fails when trying to complete multiple tokens + # (in order to avoid white-space quoting). + mapfile -t COMPREPLY < <(openrussian -C "${COMP_WORDS[@]:1}" 2>/dev/null) - COMPREPLY=($(sqlite3 "$PREFIX/share/openrussian/openrussian-sqlite3.db" "$SQL")) + # NOTE: openrussian.lua currently does not complete switch-names. 
+ COMPREPLY+=($(compgen -W "-Len -Lde -V -p" -- "$2")) } complete -F _openrussian_completions openrussian diff --git a/openrussian-sql.zip b/openrussian-sql.zip Binary files differnew file mode 100644 index 0000000..585e773 --- /dev/null +++ b/openrussian-sql.zip diff --git a/openrussian.lua b/openrussian.lua index b75de7f..bd25930 100755 --- a/openrussian.lua +++ b/openrussian.lua @@ -1,29 +1,69 @@ #!/usr/bin/lua5.2 local driver = require "luasql.sqlite3" +local lutf8 = require "lua-utf8" + +local ACCENT = lutf8.char(0x0301) -- Accent combining character + +local lang = os.setlocale(nil, "ctype"):match("^([^_]+)") + +local search_words = {} local function usage(stream) - stream:write("Usage: ", arg[0], " [-p] <word>\n") + stream:write("Usage: ", arg[0], " [-L<lang>] [-V] [-p] <pattern...>\n", + "\t-L<lang> Set language to <lang> (currently en or de, guessed from locale)\n", + "\t-V Verbatim matching (no case folding and inflections)\n", + "\t-p Print Troff code to stdout\n") end for i = 1, #arg do if arg[i]:sub(1, 1) == "-" then - if arg[i]:sub(2) == "p" then + local opt = arg[i]:sub(2) + + if opt:sub(1, 1) == "L" then + if #opt > 1 then + lang = opt:sub(2) + elseif i == #arg then + usage(io.stderr) + os.exit(false) + else + lang = arg[i+1] + i = i + 1 + end + elseif opt == "V" then + verbatim = true + elseif opt == "p" then use_stdout = true + elseif opt == "C" then + -- This is a "secret" command used for implementing + -- auto-completions. + -- It will usually be the first argument. + auto_complete = true else usage(io.stderr) os.exit(false) end else - search_word = arg[i] + table.insert(search_words, arg[i]) end end -if not search_word then +if #search_words == 0 then usage(io.stderr); os.exit(false) end +-- Allowing multiple arguments to be concat into the search words +-- is useful when searching for a translation which may contain +-- spaces without quoting the entire search term. +local search_word = table.concat(search_words, " ").. 
+ (auto_complete and "*" or "") + +-- FIXME: Currently only English and German are actually +-- contained in the database, but this might change. +-- Perhaps query the availability dynamically. +if lang ~= "en" and lang ~= "de" then lang = "en" end + local function dirname(path) return path:match("^(.*)/.+$") or "." end @@ -37,35 +77,71 @@ local database = PREFIX.."/share/openrussian/openrussian-sqlite3.db" if not io.open(database) then database = "openrussian-sqlite3.db" end local out_stream -local lang = "en" local env = assert(driver.sqlite3()) local con = assert(env:connect(database)) +-- A SQL-compatible globber. +-- Necessary since globbing is usually done as part of the +-- SQL query. +-- +-- NOTE: This may be reimplemented more efficiently by translating +-- the glob pattern to a Lua pattern. +-- Unfortunately, the Glob pattern syntax appears to be undefined, +-- probably because it defaults to the system glob. +-- +-- Alternatively, we might override the MATCH function with Lua patterns +-- and use MATCH instead of GLOB, but this might be inefficient. +-- In order to make use of the query optimizer, we must either use +-- LIKE or GLOB. +-- +-- Yet another alternative might be to parse all translations into +-- a separate index, speeding up translation lookups and avoiding +-- the need for globbing in Lua here. +function glob(pattern, str) + local cur = assert(con:execute(string.format([[ + SELECT '%s' GLOB '%s' + ]], con:escape(str), con:escape(pattern)))) + local row = assert(cur:fetch()) + cur:close() + + return row ~= 0 +end + -- Turns a character followed by apostroph into a combined -- accented character. -- NOTE: This encodes the accent (u0301) in bytes, so it can be -- used for printing to stdout or into Troff code. 
local function map_accented(str) - return (str:gsub("'", "\xCC\x81")) + return (lutf8.gsub(str, "'", ACCENT)) end --- FIXME: This does not work for tables since tbl will count the +-- FIXME: map_accented() does not work for tables since tbl will count the -- combined character as two. Theoretically, Groff has composite characters -- like \u[u043E_0301] but they don't work for all the cyrillic -- vocals. +-- If we really wanted to, we could replace every accented character +-- with an inline macro that is defined at Troff runtime depending on the +-- output device, so we could get accented characters in PDF tables at least. local function map_tbl(str) - return (str:gsub("(..)'", "\\fI%1\\fP")) + return (lutf8.gsub(str, "(.)'", "\\fI%1\\fP")) end +-- FIXME: Apparently, there are entries without declension or empty declension +-- entries, e.g. kosha4ij. +-- These should be detected and the entire section should be omitted. local function format_declension(tag, decl_id, short_form) local cur = assert(con:execute(string.format([[ SELECT * FROM declensions WHERE id = %d ]], decl_id))) local row = assert(cur:fetch({}, "a")) cur:close() - out_stream:write(tag, ';', map_tbl(row.nom), ';', map_tbl(row.gen), ';', - map_tbl(row.dat), ';', map_tbl(row.acc), ';', - map_tbl(row.inst), ';', map_tbl(row.prep)) + + out_stream:write(tag, ';', map_tbl(row.nom or "-"), ';', + map_tbl(row.gen or "-"), ';', + map_tbl(row.dat or "-"), ';', + map_tbl(row.acc or "-"), ';', + map_tbl(row.inst or "-"), ';', + map_tbl(row.prep or "-")) if short_form then out_stream:write(';', map_tbl(short_form)) end out_stream:write('\n') end @@ -83,11 +159,14 @@ function format.noun(word_id, accented) local cur = assert(con:execute(string.format([[ SELECT * FROM nouns WHERE word_id = %d ]], word_id))) - local row = assert(cur:fetch({}, "a")) + local row = cur:fetch({}, "a") cur:close() - out_stream:write('.SH WORD\n', - map_accented(accented), ' \\-\\- noun, ') + -- NOTE: This can probably happen as with any 
other word category + -- (example?) + if not row then return end + + out_stream:write('.SH GENDER\n') if row.gender and row.gender ~= "" then local genders = {m = "male", f = "female", n = "neuter"} out_stream:write(genders[row.gender], ', ') @@ -95,12 +174,16 @@ function format.noun(word_id, accented) out_stream:write(row.animate == 1 and 'animate' or 'inanimate', '\n') if row.partner and row.partner ~= "" then - -- FIXME: What exactly is a noun "partner"? - -- Seems to be used mostly for male/female pairs etc. + -- NOTE: Noun "partners" seem to be male/female counterparts. + -- FIXME: It would also be nice to include an accented version, + -- but since the DB lists the partner as a string instead of + -- word_id, finding the right entry could be unreliable out_stream:write('.SH PARTNER\n', row.partner, '\n') end + -- FIXME: Rotate this table (only two columns: singular and plural). + -- Lines should be short. out_stream:write('.SH DECLENSION\n', '.TS\n', 'allbox,tab(;);\n', @@ -128,12 +211,18 @@ function format.adjective(word_id, accented) local cur = assert(con:execute(string.format([[ SELECT * FROM adjectives WHERE word_id = %d ]], word_id))) - local row = assert(cur:fetch({}, "a")) + local row = cur:fetch({}, "a") cur:close() - out_stream:write('.SH WORD\n', - map_accented(accented), ' \\-\\- adjective\n') + -- NOTE: Seldomly (e.g. nesomnenno), there is no entry in adjectives + if not row then return end + + --out_stream:write('.SH CATEGORY\n', + -- 'adjective\n') + -- FIXME: Rotate this table (columns will be gender+plural). + -- Lines should be short. 
+ -- FIXME: Short form not always present out_stream:write('.SH DECLENSION\n', '.TS\n', 'allbox,tab(;);\n', @@ -146,36 +235,52 @@ function format.adjective(word_id, accented) format_declension('Plural', row.decl_pl_id, row.short_pl) out_stream:write('.TE\n') - if row.comparative then + if row.comparative and row.comparative ~= "" then out_stream:write('.SH COMPARATIVE\n', map_accented(row.comparative), '\n') end - if row.superlative then + if row.superlative and row.superlative ~= "" then out_stream:write('.SH SUPERLATIVE\n', map_accented(row.superlative), '\n') end end +-- NOTE: There is no separate table for adverbs +-- Currently, we wouldn't print more than the category, which is also in the +-- header, so it is omitted. +function format.adverb(word_id, accented) + --out_stream:write('.SH CATEGORY\n', + -- 'adverb\n') +end + function format.verb(word_id, accented) local cur = assert(con:execute(string.format([[ SELECT * FROM verbs JOIN conjugations ON verbs.presfut_conj_id = conjugations.id WHERE verbs.word_id = %d ]], word_id))) - local row = assert(cur:fetch({}, "a")) + local row = cur:fetch({}, "a") cur:close() - out_stream:write('.SH WORD\n', - map_accented(accented), ' \\-\\- verb') - if row.aspect then out_stream:write(', ', row.aspect) end - out_stream:write('\n') + -- NOTE: Seldomly (e.g. est' -- to be), there is no entry in verbs + if not row then return end + + if row.aspect then + out_stream:write('.SH ASPECT\n', + row.aspect, '\n') + end if row.partner and row.partner ~= "" then - -- NOTE: Verb partners seem to be the aspect partners + -- NOTE: Verb partners seem to be the aspect partners. + -- They are either comma or semicolon separated. 
+ -- FIXME: It would also be nice to include an accented version, + -- but since the DB lists the partner as a string instead of + -- word_id, finding the right entry could be unreliable out_stream:write('.SH PARTNER\n', - row.partner, '\n') + lutf8.gsub(row.partner, "[;,]", ", "), '\n') end + -- FIXME: Conjugation sometimes empty (e.g. widat') -- FIXME: Can we assume that verbs without specified aspect are always -- perfective? out_stream:write('.SH ', row.aspect == "imperfective" and 'PRESENT\n' or 'FUTURE\n', @@ -200,22 +305,26 @@ function format.verb(word_id, accented) end function format.other(word_id, accented) - out_stream:write('.SH WORD\n', - map_accented(accented), '\n') + --out_stream:write('.SH CATEGORY\n', + -- 'other\n') end -function get_translations(word_id) +local function get_translations(word_id) local ret = {} -- FIXME: Fetch other translations if primary -- language is not available local cur = assert(con:execute(string.format([[ SELECT tl FROM translations - WHERE word_id = %d AND lang = "%s" - ]], word_id, lang))) + WHERE word_id = %d AND lang = '%s' + ]], word_id, con:escape(lang)))) local row = cur:fetch({}, "a") while row do - table.insert(ret, row.tl) + -- NOTE: One entry might contain many comma-separated + -- translations + for word in lutf8.gmatch(row.tl..", ", "(.-), ") do + table.insert(ret, word) + end row = cur:fetch({}, "a") end cur:close() @@ -223,13 +332,32 @@ function get_translations(word_id) return ret end --- NOTE: This lets SQL strip the accent char from the input, which --- allows users to cut and paste from generated output while we don't --- have to deal with Unicode in Lua. +-- Format reference to row from the words-table. +-- FIXME: Not printed bold since bold text and accents +-- don't work together (URxvt). +local function get_reference(word_row) + return map_accented(word_row.accented or word_row.bare).. 
+ '('..(word_row.type or "other")..')' +end + +-- NOTE: This strips the accent char, so users can cut and paste from +-- generated output. +-- This is done from Lua, since the right-hand side of GLOB should be a constant +-- to allow optimizations: +-- https://www.sqlite.org/optoverview.html#the_like_optimization +-- +-- TODO: Double-check whether the GLOB is actually optimized. +-- Theoretically, we need COLLATE BINARY for that. +-- +-- FIXME: Case-folding UTF8 / Collating is not supported by SQLite3. +-- If we want to support case-insensitive matching, it is mandatory, though. +-- Could be done using the ICU extension: +-- https://www.sqlite.org/src/artifact?ci=trunk&filename=ext/icu/README.txt local cur = assert(con:execute(string.format([[ - SELECT bare, accented, type, words.id AS word_id - FROM words WHERE bare = REPLACE("%s", CHAR(0x0301), "") -]], search_word))) + SELECT bare AS completions, * FROM words + WHERE LIKELY(disabled = 0) AND bare GLOB '%s' + ORDER BY rank +]], con:escape(lutf8.gsub(search_word, ACCENT, ""))))) local rows = {} local row @@ -240,17 +368,189 @@ until not row cur:close() +if not verbatim then + --[==[ + -- FIXME: These queries are tooo sloooow! Perhaps that's why the openrussion.org + -- website does not allow searching by declension prefixes. + -- This is because of the need for string-concatenations for every possible word + -- and because the GLOBbing cannot be optimized, even in the most common cases. + -- FIXME: This does not find braced-terms. Glob patterns are simply not powerful + -- enough to express "optional brace". + -- We'd probably need regexp for that. 
+ cur = assert(con:execute(string.format([[ + SELECT REPLACE(temp, "'", "") AS completions, words.* FROM words JOIN ( + -- Search word might be a noun or adjective declension + SELECT nom||","||gen||","||dat||","||acc||","||inst||","||prep AS temp, word_id + FROM declensions + UNION + -- Search word might be a special adjective inflection + SELECT comparative||","||superlative||","|| + short_m||","||short_f||","||short_n||","||short_pl AS temp, word_id + FROM adjectives + UNION + -- Search word might be a verb imperative, past form or conjugation + SELECT imperative_sg||","||imperative_pl||","||past_m||","||past_f||","||past_n||","||past_pl|| + sg1||","||sg2||","||sg3||","||pl1||","||pl2||","||pl3 AS temp, verbs.word_id + FROM verbs LEFT JOIN conjugations ON presfut_conj_id = conjugations.id + ) ON words.id = word_id + WHERE LIKELY(disabled = 0) AND ","||completions||"," GLOB '*,%s,*' + ORDER BY rank + ]], con:escape(lutf8.gsub(search_word, ACCENT, ""))))) + + -- This is an alternative to the above query. + -- It eliminates the concatenations, but has to iterate many tables redundantly. + -- Effectively it is twice as slow as the above query... 
+ cur = assert(con:execute(string.format([[ + SELECT REPLACE(temp, "'", "") AS completions, words.* FROM words JOIN ( + -- Search word might be a noun or adjective declension + SELECT nom AS temp, word_id FROM declensions + UNION ALL + SELECT gen AS temp, word_id FROM declensions + UNION ALL + SELECT dat AS temp, word_id FROM declensions + UNION ALL + SELECT acc AS temp, word_id FROM declensions + UNION ALL + SELECT inst AS temp, word_id FROM declensions + UNION ALL + SELECT prep AS temp, word_id FROM declensions + UNION ALL + -- Search word might be a special adjective inflection + SELECT comparative AS temp, word_id FROM adjectives + UNION ALL + SELECT superlative AS temp, word_id FROM adjectives + UNION ALL + SELECT short_m AS temp, word_id FROM adjectives + UNION ALL + SELECT short_f AS temp, word_id FROM adjectives + UNION ALL + SELECT short_n AS temp, word_id FROM adjectives + UNION ALL + SELECT short_pl AS temp, word_id FROM adjectives + UNION ALL + -- Search word might be a verb imperative or past form + SELECT imperative_sg AS temp, word_id FROM verbs + UNION ALL + SELECT imperative_pl AS temp, word_id FROM verbs + UNION ALL + SELECT past_m AS temp, word_id FROM verbs + UNION ALL + SELECT past_f AS temp, word_id FROM verbs + UNION ALL + SELECT past_n AS temp, word_id FROM verbs + UNION ALL + SELECT past_pl AS temp, word_id FROM verbs + UNION ALL + -- Search word might be a verb conjugation + SELECT sg1 AS temp, word_id FROM conjugations + UNION ALL + SELECT sg2 AS temp, word_id FROM conjugations + UNION ALL + SELECT sg3 AS temp, word_id FROM conjugations + UNION ALL + SELECT pl1 AS temp, word_id FROM conjugations + UNION ALL + SELECT pl2 AS temp, word_id FROM conjugations + UNION ALL + SELECT pl3 AS temp, word_id FROM conjugations + ) ON words.id = word_id + WHERE LIKELY(disabled = 0) AND completions GLOB '%s' + ORDER BY rank + ]], con:escape(lutf8.gsub(search_word, ACCENT, ""))))) + ]==] + + -- This query uses a new `bare_inflections` table, since all 
queries + -- using existing tables (see above) are way too slow, especially for + -- autocompletions. + -- NOTE: The right-hand side of GLOB must be a constant, so that it can be + -- optimized using the index. + cur = assert(con:execute(string.format([[ + SELECT bare_inflections.bare AS completions, words.* + FROM words JOIN bare_inflections ON words.id = word_id + WHERE LIKELY(disabled = 0) AND completions GLOB '%s' + ORDER BY rank + ]], con:escape(lutf8.gsub(search_word, ACCENT, ""))))) + + repeat + row = cur:fetch({}, "a") + table.insert(rows, row) + until not row + + cur:close() +end + +-- Only if we do not find a Russian word, we try to find a translation. +-- This is not wrapped with the above query into one using a LEFT JOIN since +-- two queries are significantly faster - probably because of having to perform less +-- string concatenations. +if #rows == 0 then + -- NOTE: The translation entry frequently contains a comma-separated + -- list of translations + -- + -- FIXME: Case folding only works for ASCII, which should be sufficient for + -- German/English text (almost)... + -- FIXME: The string concatenation is a real slow-down and the GLOB cannot + -- be optimized. + -- Perhaps the translations should be in their own (new) indexed table. 
+ cur = assert(con:execute(string.format([[ + SELECT %s(", "||tl||", ") AS completions, words.* + FROM words JOIN translations ON words.id = word_id + WHERE LIKELY(disabled = 0) AND lang = '%s' AND completions GLOB %s('*, %s, *') + ORDER BY rank + ]], verbatim and "" or "LOWER", con:escape(lang), verbatim and "" or "LOWER", con:escape(search_word)))) + + repeat + row = cur:fetch({}, "a") + table.insert(rows, row) + until not row + + cur:close() +end + +if auto_complete then + -- FIXME: See above for notes on case-folding + local search_word_bare = lutf8.gsub(search_word, ACCENT, "") + search_word_bare = verbatim and search_word_bare or search_word_bare:lower() + + for _, row in ipairs(rows) do + -- NOTE: This code is reused for Russian base words, inflections and translated lookups, + -- so there is a common `completions` column. + -- Russian words can be treated like single-word translations. + -- Terms in this column can be comma-separated with and without spaces and + -- there may be braces. 
+ for word in lutf8.gmatch(row.completions..",", " *%(?(.-)%)?,") do + if glob(search_word, word) then + io.stdout:write(search_words[#search_words], + lutf8.sub(word, lutf8.len(search_word_bare)), "\n") + end + end + end + + os.exit(true) +end + if #rows == 0 then io.stderr:write('Word "', search_word, '" not found!\n') os.exit(false) end -if #rows == 1 then - row = rows[1] +-- Filter out duplicates +local word_ids = {} +local unique_rows = {} + +for _, row in ipairs(rows) do + if not word_ids[row.id] then + table.insert(unique_rows, row) + word_ids[row.id] = true + end +end + +if #unique_rows == 1 then + row = unique_rows[1] else - for i, row in ipairs(rows) do + for i, row in ipairs(unique_rows) do local word_accented = row.accented or row.bare - local tl = get_translations(row.word_id) + local tl = get_translations(row.id) io.stdout:write(i, ") ", map_accented(word_accented)) if #tl > 0 then io.stdout:write(" (", table.concat(tl, ", "), ")") end @@ -258,68 +558,143 @@ else end repeat - io.stdout:write("Show [1..", #rows, ", press enter to cancel]? "):flush() + io.stdout:write("Show [1..", #unique_rows, ", press enter to cancel]? "):flush() local choice = io.stdin:read():lower() if choice == "" or choice == "q" then os.exit() end - row = rows[tonumber(choice)] + row = unique_rows[tonumber(choice)] until row end -local word_id = row.word_id -local word_type = row.type or "other" +local word_id = row.id -- NOTE: Some words (e.g. personal pronouns) apparently do not -- come with accents!? local word_accented = row.accented or row.bare +local word_derived_from = row.derived_from_word_id +local word_audio = row.audio +local word_usage = row["usage_"..lang] +local word_type = row.type or "other" -- Open stream only now, after no more messages have to be written to -- stdout/stderr. 
out_stream = assert(use_stdout and io.stdout or io.popen("man /dev/stdin", "w")) +-- NOTE: The headers and footers shouldn't contain critical information +-- since they might not be printed at all. out_stream:write('.\\" t\n', - '.TH "', search_word, '" "', word_type, '"\n') + '.TH "', row.bare, '" "', word_type, '" "') +if row.rank then + out_stream:write('#', row.rank, row.level and ' ('..row.level..')' or '') +else + out_stream:write(row.level) +end +out_stream:write('" "openrussian.lua" "openrussian.org"\n') + +-- +-- Generic WORD section with translation. +-- +out_stream:write('.SH WORD\n', + map_accented(word_accented)) +local tl = get_translations(word_id) +if #tl > 0 then + out_stream:write(' \\-\\- ', table.concat(tl, ', ')) +end +out_stream:write('\n') -- -- Word-specific sections +-- NOTE: word_accented is required only for format.noun() and could be +-- avoided altogether. -- format[word_type](word_id, word_accented) -- -- Generic sections -- -local tl = get_translations(word_id) -if #tl > 0 then - out_stream:write('.SH TRANSLATION\n', - table.concat(tl, ', '), '\n') +if word_usage then + out_stream:write('.SH USAGE\n', + word_usage, '\n') +end + +-- FIXME: Perhaps this should rather be part of the SEE ALSO section +if word_derived_from then + cur = assert(con:execute(string.format([[ + SELECT bare, accented, type FROM words + WHERE LIKELY(disabled = 0) AND id = %d + ]], word_derived_from))) + row = assert(cur:fetch({}, "a")) + cur:close() + + out_stream:write('.SH DERIVED FROM\n', + get_reference(row), '\n') end -- --- NOTE: There can be many examples, so print them last. +-- NOTE: There can be many examples, so print them late. 
-- cur = assert(con:execute(string.format([[ SELECT ru, start, length, tl FROM sentences_words JOIN sentences ON sentence_id = sentences.id - WHERE word_id = %d AND lang = "%s" -]], word_id, lang))) + WHERE word_id = %d AND lang = '%s' +]], word_id, con:escape(lang)))) row = cur:fetch({}, "a") if row then out_stream:write('.SH EXAMPLES\n') repeat - -- FIXME: Highlight search word in sentences. - -- start/length are apparently in characters - -- instead of bytes. - --[[ - local ru_hl = row.ru:sub(1, row.start)..'\\fI'.. - row.ru:sub(row.start+1, row.start+1+row.length)..'\\fP'.. - row.ru:sub(row.start+1+row.length+1) - ]] + -- FIXME: The accent is not always available in the default + -- italic font when formatting for PDF. + local ru_hl = lutf8.sub(row.ru, 1, row.start)..'\\fI'.. + lutf8.sub(row.ru, row.start+1, row.start+1+row.length-1)..'\\fP'.. + lutf8.sub(row.ru, row.start+1+row.length) + out_stream:write('.TP\n', - map_accented(row.ru), '\n', + map_accented(ru_hl), '\n', row.tl, '\n') row = cur:fetch({}, "a") until not row +end +cur:close() +-- Audio recordings might be useful occasionally, but this is an offline/terminal +-- application, so it makes sense to print them last (like URLs in manpages). +-- +-- NOTE: There is an UE man-macro, but it doesn't seem to be very helpful here and +-- seems to bring no advantages when formatting as a PDF. +-- It could be typset in the default fixed-width font (\fC), but it does not contain +-- cyrillic characters, so we don't do that either. +if word_audio then + out_stream:write('.SH AUDIO\n', + word_audio, '\n') end + +-- Disable adjusting (space-stretching) for the related-word lists. +-- Don't forget to enable this again if something follows these sections. +out_stream:write('.na\n') + +-- NOTE: The results are grouped by relation, so that they can be +-- easily printed in one section per relation. +-- Unfortunately, we cannot define custom collating sequences with LuaSQL. 
+-- FIXME: Print this under a single SEE ALSO master section? +-- FIXME: Results should perhaps be ordered by `type`? +cur = assert(con:execute(string.format([[ + SELECT bare, accented, type, relation + FROM words_rels JOIN words ON rel_word_id = words.id + WHERE LIKELY(disabled = 0) AND words_rels.word_id = %d + ORDER BY relation, rank +]], word_id))) + +local cur_relation +row = cur:fetch({}, "a") +while row do + if cur_relation ~= row.relation then + cur_relation = row.relation + out_stream:write('.SH ', cur_relation:upper(), '\n') + end + out_stream:write(get_reference(row)) + row = cur:fetch({}, "a") + out_stream:write(row and row.relation == cur_relation and ', ' or '\n') +end + cur:close() -- diff --git a/postprocess.sql b/postprocess.sql new file mode 100644 index 0000000..5976cb1 --- /dev/null +++ b/postprocess.sql @@ -0,0 +1,78 @@ +-- Drop unused tables, hopefully saving a few megabytes. +-- `expressions_words` is for excercises??? +DROP TABLE expressions_words; +-- `categories_words2` references `categories` but is not in the database!? +DROP TABLE categories_words2; + +-- This table is especially added to aid lookups and autocompletions on all +-- possible inflections. +-- This is significantly faster than using the existing tables. +-- +-- FIXME: Some of the columns contain multiple comma-separated values +-- (sometimes in braces), so we probably need a Lua script for initializion. +-- TODO: Translations should probably be in this table, or a separate one as well. +-- But see above. +CREATE TABLE bare_inflections (`word_id` INTEGER NOT NULL, `bare` VARCHAR(100) NOT NULL COLLATE BINARY); +-- Indexing the `bare` column allows ultra-fast lookups even via GLOB +-- in most common cases. For this optimization to work, the column must +-- also be BINARY collated. 
See also: +-- https://www.sqlite.org/optoverview.html#the_like_optimization +CREATE INDEX idx_bare_inflections_bare ON bare_inflections (`bare`); +INSERT INTO bare_inflections SELECT word_id, REPLACE(temp, "'", "") AS bare +FROM ( + -- Search word might be a noun or adjective declension + SELECT word_id, nom AS temp FROM declensions + UNION ALL + SELECT word_id, gen AS temp FROM declensions + UNION ALL + SELECT word_id, dat AS temp FROM declensions + UNION ALL + SELECT word_id, acc AS temp FROM declensions + UNION ALL + SELECT word_id, inst AS temp FROM declensions + UNION ALL + SELECT word_id, prep AS temp FROM declensions + UNION ALL + -- Search word might be a special adjective inflection + SELECT word_id, comparative AS temp FROM adjectives + UNION ALL + SELECT word_id, superlative AS temp FROM adjectives + UNION ALL + SELECT word_id, short_m AS temp FROM adjectives + UNION ALL + SELECT word_id, short_f AS temp FROM adjectives + UNION ALL + SELECT word_id, short_n AS temp FROM adjectives + UNION ALL + SELECT word_id, short_pl AS temp FROM adjectives + UNION ALL + -- Search word might be a verb imperative or past form + SELECT word_id, imperative_sg AS temp FROM verbs + UNION ALL + SELECT word_id, imperative_pl AS temp FROM verbs + UNION ALL + SELECT word_id, past_m AS temp FROM verbs + UNION ALL + SELECT word_id, past_f AS temp FROM verbs + UNION ALL + SELECT word_id, past_n AS temp FROM verbs + UNION ALL + SELECT word_id, past_pl AS temp FROM verbs + UNION ALL + -- Search word might be a verb conjugation + SELECT word_id, sg1 AS temp FROM conjugations + UNION ALL + SELECT word_id, sg2 AS temp FROM conjugations + UNION ALL + SELECT word_id, sg3 AS temp FROM conjugations + UNION ALL + SELECT word_id, pl1 AS temp FROM conjugations + UNION ALL + SELECT word_id, pl2 AS temp FROM conjugations + UNION ALL + SELECT word_id, pl3 AS temp FROM conjugations +) +WHERE bare <> ""; + +-- Saves a few megabytes +VACUUM; |