/*
 * Copyright 2025 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
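/***MODULE:chartable
 * rspamd module that marks messages based on chains of characters from mixed scripts
 *
 * Allowed options:
 * - symbol (string): symbol to insert (default: 'R_MIXED_CHARSET')
 * - url_symbol (string): symbol registered for URL checks (default: 'R_MIXED_CHARSET_URL')
 * - threshold (double): value that would be used as threshold in expression characters_changed / total_characters
 *   (e.g. if threshold is 0.1 then charset change should occur more often than in 10 symbols), default: 0.1
 * - max_word_len (integer): words longer than this value are not scored (default: 10)
 */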


#include "config.h"
#include "libmime/message.h"
#include "rspamd.h"
#include "libstat/stat_api.h"
#include "libmime/lang_detection.h"

#include "unicode/utf8.h"
#include "unicode/uchar.h"
#include "contrib/ankerl/unordered_dense.h"

/* Symbol name used for text/meta-word checks when the `symbol` option is not set */
#define DEFAULT_SYMBOL "R_MIXED_CHARSET"
/* Symbol name used for the URL callback when the `url_symbol` option is not set */
#define DEFAULT_URL_SYMBOL "R_MIXED_CHARSET_URL"
/* Score threshold used when the `threshold` option is missing or not numeric */
#define DEFAULT_THRESHOLD 0.1

/*
 * Module-specific debug logging helper.
 * The expansion references a variable named `task` (task->from_addr and
 * task->task_pool->tag.uid), so it can only be used where such a variable
 * is in scope.
 * NOTE(review): rspamd_chartable_log_id is presumably declared by
 * INIT_LOG_MODULE(chartable) below - confirm against the logger headers.
 */
#define msg_debug_chartable(...) rspamd_conditional_debug_fast(nullptr, task->from_addr,                                       \
															   rspamd_chartable_log_id, "chartable", task->task_pool->tag.uid, \
															   G_STRFUNC,                                                      \
															   __VA_ARGS__)

INIT_LOG_MODULE(chartable)

/* Initialization */
/* Allocates the module context and stores it into *ctx (defined below) */
int chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx);

/* Reads the module options and registers the module symbols (defined below) */
int chartable_module_config(struct rspamd_config *cfg, bool validate);

/* Re-runs chartable_module_config() with validate == false (defined below) */
int chartable_module_reconfig(struct rspamd_config *cfg);

/*
 * Module descriptor for the chartable module.
 * The initializer is positional: the module name, then the three entry
 * points declared above, followed by a null pointer, the module version
 * constant and (unsigned int) -1.
 * NOTE(review): the meaning of the last three slots depends on the layout of
 * module_t, which is not visible in this file - confirm before reordering.
 * chartable_get_context() reads chartable_module.ctx_offset, so that field is
 * expected to be filled in elsewhere before the module is used.
 */
module_t chartable_module = {
	"chartable",
	chartable_module_init,
	chartable_module_config,
	chartable_module_reconfig,
	nullptr,
	RSPAMD_MODULE_VER,
	(unsigned int) -1,
};

/*
 * Per-configuration state of the chartable module.
 * The generic module_ctx is the first member because the structure is handed
 * out as a `struct module_ctx *` by chartable_module_init().
 */
struct chartable_ctx {
	struct module_ctx ctx;
	/* Symbol inserted for text parts and meta words; set by chartable_module_config() */
	const char *symbol;
	/* Symbol registered for the URL callback; set by chartable_module_config() */
	const char *url_symbol;
	/* A computed score must be strictly greater than this value to insert a symbol */
	double threshold;
	/* Words longer than this get zero badness; initialised to 10 in chartable_module_init() */
	unsigned int max_word_len;
};

/*
 * Looks up this module's context in the configuration: the context pointer
 * lives in cfg->c_modules at the index stored in chartable_module.ctx_offset.
 */
static inline struct chartable_ctx *
chartable_get_context(struct rspamd_config *cfg)
{
	void *raw_ctx = g_ptr_array_index(cfg->c_modules,
									  chartable_module.ctx_offset);

	return (struct chartable_ctx *) raw_ctx;
}

/*
 * Forward declarations of the symbol callbacks: they are registered in
 * chartable_module_config() but defined at the end of the file.
 */

/* Callback for the main symbol: scores text parts and meta words */
static void chartable_symbol_callback(struct rspamd_task *task,
									  struct rspamd_symcache_dynamic_item *item,
									  void *unused);

/* Callback for the URL symbol (its scoring body is currently compiled out) */
static void chartable_url_symbol_callback(struct rspamd_task *task,
										  struct rspamd_symcache_dynamic_item *item,
										  void *unused);

/*
 * Module init hook: allocates a zero-filled chartable context from the
 * configuration memory pool, presets the maximum word length to 10 and hands
 * the context back through *ctx. Always returns 0.
 */
int chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
{
	struct chartable_ctx *fresh_ctx =
		rspamd_mempool_alloc0_type(cfg->cfg_pool, struct chartable_ctx);

	/* Every other field keeps its zero value until chartable_module_config() runs */
	fresh_ctx->max_word_len = 10;
	*ctx = (struct module_ctx *) fresh_ctx;

	return 0;
}


/*
 * Module config hook: reads the `chartable` options from the configuration
 * and registers both module symbols in the symbols cache.
 *
 * Options read: `symbol`, `url_symbol`, `threshold` and `max_word_len`.
 * Missing string options fall back to DEFAULT_SYMBOL / DEFAULT_URL_SYMBOL,
 * a missing or non-numeric `threshold` falls back to DEFAULT_THRESHOLD, and
 * a missing `max_word_len` keeps the value already stored in the context
 * (chartable_module_init() presets it to 10).
 *
 * @param cfg configuration to read the options from
 * @param _   unused
 * @return TRUE; when the module is disabled nothing is read or registered
 */
int chartable_module_config(struct rspamd_config *cfg, bool _)
{
	const ucl_object_t *value;
	int res = TRUE;
	struct chartable_ctx *chartable_module_ctx = chartable_get_context(cfg);

	if (!rspamd_config_is_module_enabled(cfg, "chartable")) {
		return TRUE;
	}

	if ((value =
			 rspamd_config_get_module_opt(cfg, "chartable", "symbol")) != nullptr) {
		chartable_module_ctx->symbol = ucl_obj_tostring(value);
	}
	else {
		chartable_module_ctx->symbol = DEFAULT_SYMBOL;
	}
	if ((value =
			 rspamd_config_get_module_opt(cfg, "chartable", "url_symbol")) != nullptr) {
		chartable_module_ctx->url_symbol = ucl_obj_tostring(value);
	}
	else {
		chartable_module_ctx->url_symbol = DEFAULT_URL_SYMBOL;
	}
	if ((value =
			 rspamd_config_get_module_opt(cfg, "chartable", "threshold")) != nullptr) {
		if (!ucl_obj_todouble_safe(value, &chartable_module_ctx->threshold)) {
			msg_warn_config("invalid numeric value");
			chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
		}
	}
	else {
		chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
	}
	if ((value =
			 rspamd_config_get_module_opt(cfg, "chartable", "max_word_len")) != nullptr) {
		chartable_module_ctx->max_word_len = ucl_object_toint(value);
	}
	/*
	 * Deliberately no else branch here: the previous code reset `threshold`
	 * to DEFAULT_THRESHOLD at this point, which silently discarded a
	 * configured threshold whenever `max_word_len` was not set as well.
	 */

	rspamd_symcache_add_symbol(cfg->cache,
							   chartable_module_ctx->symbol,
							   0,
							   chartable_symbol_callback,
							   nullptr,
							   SYMBOL_TYPE_NORMAL,
							   -1);
	rspamd_symcache_add_symbol(cfg->cache,
							   chartable_module_ctx->url_symbol,
							   0,
							   chartable_url_symbol_callback,
							   nullptr,
							   SYMBOL_TYPE_NORMAL,
							   -1);

	msg_info_config("init internal chartable module");

	return res;
}

/*
 * Module reconfig hook: delegates to chartable_module_config() with its
 * second argument set to false and returns whatever that call returns.
 */
int chartable_module_reconfig(struct rspamd_config *cfg)
{
	const bool validate = false;

	return chartable_module_config(cfg, validate);
}

/*
 * Set of code points (stored as int) that rspamd_can_alias_latin() reports as
 * able to alias Latin characters. The set is built once from the initializer
 * list below and is only queried afterwards.
 * NOTE(review): the list looks like it was derived from Unicode confusables
 * data - confirm the origin before adding or removing entries by hand.
 */
static const auto latin_confusable = ankerl::unordered_dense::set<int>{
	0x02028,
	0x02029,
	0x01680,
	0x02000,
	0x02001,
	0x02002,
	0x02003,
	0x02004,
	0x02005,
	0x02006,
	0x02008,
	0x02009,
	0x0200a,
	0x0205f,
	0x000a0,
	0x02007,
	0x0202f,
	0x007fa,
	0x0fe4d,
	0x0fe4e,
	0x0fe4f,
	0x02010,
	0x02011,
	0x02012,
	0x02013,
	0x0fe58,
	0x006d4,
	0x02043,
	0x002d7,
	0x02212,
	0x02796,
	0x02cba,
	0x0060d,
	0x0066b,
	0x0201a,
	0x000b8,
	0x0a4f9,
	0x0037e,
	0x00903,
	0x00a83,
	0x0ff1a,
	0x00589,
	0x00703,
	0x00704,
	0x016ec,
	0x0fe30,
	0x01803,
	0x01809,
	0x0205a,
	0x005c3,
	0x002f8,
	0x0a789,
	0x02236,
	0x002d0,
	0x0a4fd,
	0x0ff01,
	0x001c3,
	0x02d51,
	0x00294,
	0x00241,
	0x0097d,
	0x013ae,
	0x0a6eb,
	0x1d16d,
	0x02024,
	0x00701,
	0x00702,
	0x0a60e,
	0x10a50,
	0x00660,
	0x006f0,
	0x0a4f8,
	0x0055d,
	0x0ff07,
	0x02018,
	0x02019,
	0x0201b,
	0x02032,
	0x02035,
	0x0055a,
	0x005f3,
	0x00060,
	0x01fef,
	0x0ff40,
	0x000b4,
	0x00384,
	0x01ffd,
	0x01fbd,
	0x01fbf,
	0x01ffe,
	0x002b9,
	0x00374,
	0x002c8,
	0x002ca,
	0x002cb,
	0x002f4,
	0x002bb,
	0x002bd,
	0x002bc,
	0x002be,
	0x0a78c,
	0x005d9,
	0x007f4,
	0x007f5,
	0x0144a,
	0x016cc,
	0x16f51,
	0x16f52,
	0x0ff3b,
	0x02768,
	0x02772,
	0x03014,
	0x0fd3e,
	0x0ff3d,
	0x02769,
	0x02773,
	0x03015,
	0x0fd3f,
	0x02774,
	0x1d114,
	0x02775,
	0x0204e,
	0x0066d,
	0x02217,
	0x1031f,
	0x01735,
	0x02041,
	0x02215,
	0x02044,
	0x02571,
	0x027cb,
	0x029f8,
	0x1d23a,
	0x031d3,
	0x03033,
	0x02cc6,
	0x030ce,
	0x04e3f,
	0x02f03,
	0x0ff3c,
	0x0fe68,
	0x02216,
	0x027cd,
	0x029f5,
	0x029f9,
	0x1d20f,
	0x1d23b,
	0x031d4,
	0x04e36,
	0x02f02,
	0x0a778,
	0x002c4,
	0x002c6,
	0x016ed,
	0x02795,
	0x1029b,
	0x02039,
	0x0276e,
	0x002c2,
	0x1d236,
	0x01438,
	0x016b2,
	0x01400,
	0x02e40,
	0x030a0,
	0x0a4ff,
	0x0203a,
	0x0276f,
	0x002c3,
	0x1d237,
	0x01433,
	0x16f3f,
	0x02053,
	0x002dc,
	0x01fc0,
	0x0223c,
	0x1d7d0,
	0x1d7da,
	0x1d7e4,
	0x1d7ee,
	0x1d7f8,
	0x0a75a,
	0x001a7,
	0x003e8,
	0x0a644,
	0x014bf,
	0x0a6ef,
	0x1d206,
	0x1d7d1,
	0x1d7db,
	0x1d7e5,
	0x1d7ef,
	0x1d7f9,
	0x0a7ab,
	0x0021c,
	0x001b7,
	0x0a76a,
	0x02ccc,
	0x00417,
	0x004e0,
	0x16f3b,
	0x118ca,
	0x1d7d2,
	0x1d7dc,
	0x1d7e6,
	0x1d7f0,
	0x1d7fa,
	0x013ce,
	0x118af,
	0x1d7d3,
	0x1d7dd,
	0x1d7e7,
	0x1d7f1,
	0x1d7fb,
	0x001bc,
	0x118bb,
	0x1d7d4,
	0x1d7de,
	0x1d7e8,
	0x1d7f2,
	0x1d7fc,
	0x02cd2,
	0x00431,
	0x013ee,
	0x118d5,
	0x1d212,
	0x1d7d5,
	0x1d7df,
	0x1d7e9,
	0x1d7f3,
	0x1d7fd,
	0x104d2,
	0x118c6,
	0x00b03,
	0x009ea,
	0x00a6a,
	0x1e8cb,
	0x1d7d6,
	0x1d7e0,
	0x1d7ea,
	0x1d7f4,
	0x1d7fe,
	0x00223,
	0x00222,
	0x1031a,
	0x00a67,
	0x00b68,
	0x009ed,
	0x00d6d,
	0x1d7d7,
	0x1d7e1,
	0x1d7eb,
	0x1d7f5,
	0x1d7ff,
	0x0a76e,
	0x02cca,
	0x118cc,
	0x118ac,
	0x118d6,
	0x0237a,
	0x0ff41,
	0x1d41a,
	0x1d44e,
	0x1d482,
	0x1d4b6,
	0x1d4ea,
	0x1d51e,
	0x1d552,
	0x1d586,
	0x1d5ba,
	0x1d5ee,
	0x1d622,
	0x1d656,
	0x1d68a,
	0x00251,
	0x003b1,
	0x1d6c2,
	0x1d6fc,
	0x1d736,
	0x1d770,
	0x1d7aa,
	0x00430,
	0x0ff21,
	0x1d400,
	0x1d434,
	0x1d468,
	0x1d49c,
	0x1d4d0,
	0x1d504,
	0x1d538,
	0x1d56c,
	0x1d5a0,
	0x1d5d4,
	0x1d608,
	0x1d63c,
	0x1d670,
	0x00391,
	0x1d6a8,
	0x1d6e2,
	0x1d71c,
	0x1d756,
	0x1d790,
	0x00410,
	0x013aa,
	0x015c5,
	0x0a4ee,
	0x16f40,
	0x102a0,
	0x1d41b,
	0x1d44f,
	0x1d483,
	0x1d4b7,
	0x1d4eb,
	0x1d51f,
	0x1d553,
	0x1d587,
	0x1d5bb,
	0x1d5ef,
	0x1d623,
	0x1d657,
	0x1d68b,
	0x00184,
	0x0042c,
	0x013cf,
	0x015af,
	0x0ff22,
	0x0212c,
	0x1d401,
	0x1d435,
	0x1d469,
	0x1d4d1,
	0x1d505,
	0x1d539,
	0x1d56d,
	0x1d5a1,
	0x1d5d5,
	0x1d609,
	0x1d63d,
	0x1d671,
	0x0a7b4,
	0x00392,
	0x1d6a9,
	0x1d6e3,
	0x1d71d,
	0x1d757,
	0x1d791,
	0x00412,
	0x013f4,
	0x015f7,
	0x0a4d0,
	0x10282,
	0x102a1,
	0x10301,
	0x0ff43,
	0x0217d,
	0x1d41c,
	0x1d450,
	0x1d484,
	0x1d4b8,
	0x1d4ec,
	0x1d520,
	0x1d554,
	0x1d588,
	0x1d5bc,
	0x1d5f0,
	0x1d624,
	0x1d658,
	0x1d68c,
	0x01d04,
	0x003f2,
	0x02ca5,
	0x00441,
	0x0abaf,
	0x1043d,
	0x1f74c,
	0x118f2,
	0x118e9,
	0x0ff23,
	0x0216d,
	0x02102,
	0x0212d,
	0x1d402,
	0x1d436,
	0x1d46a,
	0x1d49e,
	0x1d4d2,
	0x1d56e,
	0x1d5a2,
	0x1d5d6,
	0x1d60a,
	0x1d63e,
	0x1d672,
	0x003f9,
	0x02ca4,
	0x00421,
	0x013df,
	0x0a4da,
	0x102a2,
	0x10302,
	0x10415,
	0x1051c,
	0x0217e,
	0x02146,
	0x1d41d,
	0x1d451,
	0x1d485,
	0x1d4b9,
	0x1d4ed,
	0x1d521,
	0x1d555,
	0x1d589,
	0x1d5bd,
	0x1d5f1,
	0x1d625,
	0x1d659,
	0x1d68d,
	0x00501,
	0x013e7,
	0x0146f,
	0x0a4d2,
	0x0216e,
	0x02145,
	0x1d403,
	0x1d437,
	0x1d46b,
	0x1d49f,
	0x1d4d3,
	0x1d507,
	0x1d53b,
	0x1d56f,
	0x1d5a3,
	0x1d5d7,
	0x1d60b,
	0x1d63f,
	0x1d673,
	0x013a0,
	0x015de,
	0x015ea,
	0x0a4d3,
	0x0212e,
	0x0ff45,
	0x0212f,
	0x02147,
	0x1d41e,
	0x1d452,
	0x1d486,
	0x1d4ee,
	0x1d522,
	0x1d556,
	0x1d58a,
	0x1d5be,
	0x1d5f2,
	0x1d626,
	0x1d65a,
	0x1d68e,
	0x0ab32,
	0x00435,
	0x004bd,
	0x022ff,
	0x0ff25,
	0x02130,
	0x1d404,
	0x1d438,
	0x1d46c,
	0x1d4d4,
	0x1d508,
	0x1d53c,
	0x1d570,
	0x1d5a4,
	0x1d5d8,
	0x1d60c,
	0x1d640,
	0x1d674,
	0x00395,
	0x1d6ac,
	0x1d6e6,
	0x1d720,
	0x1d75a,
	0x1d794,
	0x00415,
	0x02d39,
	0x013ac,
	0x0a4f0,
	0x118a6,
	0x118ae,
	0x10286,
	0x1d41f,
	0x1d453,
	0x1d487,
	0x1d4bb,
	0x1d4ef,
	0x1d523,
	0x1d557,
	0x1d58b,
	0x1d5bf,
	0x1d5f3,
	0x1d627,
	0x1d65b,
	0x1d68f,
	0x0ab35,
	0x0a799,
	0x0017f,
	0x01e9d,
	0x00584,
	0x1d213,
	0x02131,
	0x1d405,
	0x1d439,
	0x1d46d,
	0x1d4d5,
	0x1d509,
	0x1d53d,
	0x1d571,
	0x1d5a5,
	0x1d5d9,
	0x1d60d,
	0x1d641,
	0x1d675,
	0x0a798,
	0x003dc,
	0x1d7ca,
	0x015b4,
	0x0a4dd,
	0x118c2,
	0x118a2,
	0x10287,
	0x102a5,
	0x10525,
	0x0ff47,
	0x0210a,
	0x1d420,
	0x1d454,
	0x1d488,
	0x1d4f0,
	0x1d524,
	0x1d558,
	0x1d58c,
	0x1d5c0,
	0x1d5f4,
	0x1d628,
	0x1d65c,
	0x1d690,
	0x00261,
	0x01d83,
	0x0018d,
	0x00581,
	0x1d406,
	0x1d43a,
	0x1d46e,
	0x1d4a2,
	0x1d4d6,
	0x1d50a,
	0x1d53e,
	0x1d572,
	0x1d5a6,
	0x1d5da,
	0x1d60e,
	0x1d642,
	0x1d676,
	0x0050c,
	0x013c0,
	0x013f3,
	0x0a4d6,
	0x0ff48,
	0x0210e,
	0x1d421,
	0x1d489,
	0x1d4bd,
	0x1d4f1,
	0x1d525,
	0x1d559,
	0x1d58d,
	0x1d5c1,
	0x1d5f5,
	0x1d629,
	0x1d65d,
	0x1d691,
	0x004bb,
	0x00570,
	0x013c2,
	0x0ff28,
	0x0210b,
	0x0210c,
	0x0210d,
	0x1d407,
	0x1d43b,
	0x1d46f,
	0x1d4d7,
	0x1d573,
	0x1d5a7,
	0x1d5db,
	0x1d60f,
	0x1d643,
	0x1d677,
	0x00397,
	0x1d6ae,
	0x1d6e8,
	0x1d722,
	0x1d75c,
	0x1d796,
	0x02c8e,
	0x0041d,
	0x013bb,
	0x0157c,
	0x0a4e7,
	0x102cf,
	0x002db,
	0x02373,
	0x0ff49,
	0x02170,
	0x02139,
	0x02148,
	0x1d422,
	0x1d456,
	0x1d48a,
	0x1d4be,
	0x1d4f2,
	0x1d526,
	0x1d55a,
	0x1d58e,
	0x1d5c2,
	0x1d5f6,
	0x1d62a,
	0x1d65e,
	0x1d692,
	0x00131,
	0x1d6a4,
	0x0026a,
	0x00269,
	0x003b9,
	0x01fbe,
	0x0037a,
	0x1d6ca,
	0x1d704,
	0x1d73e,
	0x1d778,
	0x1d7b2,
	0x00456,
	0x0a647,
	0x004cf,
	0x0ab75,
	0x013a5,
	0x118c3,
	0x0ff4a,
	0x02149,
	0x1d423,
	0x1d457,
	0x1d48b,
	0x1d4bf,
	0x1d4f3,
	0x1d527,
	0x1d55b,
	0x1d58f,
	0x1d5c3,
	0x1d5f7,
	0x1d62b,
	0x1d65f,
	0x1d693,
	0x003f3,
	0x00458,
	0x0ff2a,
	0x1d409,
	0x1d43d,
	0x1d471,
	0x1d4a5,
	0x1d4d9,
	0x1d50d,
	0x1d541,
	0x1d575,
	0x1d5a9,
	0x1d5dd,
	0x1d611,
	0x1d645,
	0x1d679,
	0x0a7b2,
	0x0037f,
	0x00408,
	0x013ab,
	0x0148d,
	0x0a4d9,
	0x1d424,
	0x1d458,
	0x1d48c,
	0x1d4c0,
	0x1d4f4,
	0x1d528,
	0x1d55c,
	0x1d590,
	0x1d5c4,
	0x1d5f8,
	0x1d62c,
	0x1d660,
	0x1d694,
	0x0212a,
	0x0ff2b,
	0x1d40a,
	0x1d43e,
	0x1d472,
	0x1d4a6,
	0x1d4da,
	0x1d50e,
	0x1d542,
	0x1d576,
	0x1d5aa,
	0x1d5de,
	0x1d612,
	0x1d646,
	0x1d67a,
	0x0039a,
	0x1d6b1,
	0x1d6eb,
	0x1d725,
	0x1d75f,
	0x1d799,
	0x02c94,
	0x0041a,
	0x013e6,
	0x016d5,
	0x0a4d7,
	0x10518,
	0x005c0,
	0x0007c,
	0x02223,
	0x023fd,
	0x0ffe8,
	0x00031,
	0x00661,
	0x006f1,
	0x10320,
	0x1e8c7,
	0x1d7cf,
	0x1d7d9,
	0x1d7e3,
	0x1d7ed,
	0x1d7f7,
	0x00049,
	0x0ff29,
	0x02160,
	0x02110,
	0x02111,
	0x1d408,
	0x1d43c,
	0x1d470,
	0x1d4d8,
	0x1d540,
	0x1d574,
	0x1d5a8,
	0x1d5dc,
	0x1d610,
	0x1d644,
	0x1d678,
	0x00196,
	0x0ff4c,
	0x0217c,
	0x02113,
	0x1d425,
	0x1d459,
	0x1d48d,
	0x1d4c1,
	0x1d4f5,
	0x1d529,
	0x1d55d,
	0x1d591,
	0x1d5c5,
	0x1d5f9,
	0x1d62d,
	0x1d661,
	0x1d695,
	0x001c0,
	0x00399,
	0x1d6b0,
	0x1d6ea,
	0x1d724,
	0x1d75e,
	0x1d798,
	0x02c92,
	0x00406,
	0x004c0,
	0x005d5,
	0x005df,
	0x00627,
	0x1ee00,
	0x1ee80,
	0x0fe8e,
	0x0fe8d,
	0x007ca,
	0x02d4f,
	0x016c1,
	0x0a4f2,
	0x16f28,
	0x1028a,
	0x10309,
	0x1d22a,
	0x0216c,
	0x02112,
	0x1d40b,
	0x1d43f,
	0x1d473,
	0x1d4db,
	0x1d50f,
	0x1d543,
	0x1d577,
	0x1d5ab,
	0x1d5df,
	0x1d613,
	0x1d647,
	0x1d67b,
	0x02cd0,
	0x013de,
	0x014aa,
	0x0a4e1,
	0x16f16,
	0x118a3,
	0x118b2,
	0x1041b,
	0x10526,
	0x0ff2d,
	0x0216f,
	0x02133,
	0x1d40c,
	0x1d440,
	0x1d474,
	0x1d4dc,
	0x1d510,
	0x1d544,
	0x1d578,
	0x1d5ac,
	0x1d5e0,
	0x1d614,
	0x1d648,
	0x1d67c,
	0x0039c,
	0x1d6b3,
	0x1d6ed,
	0x1d727,
	0x1d761,
	0x1d79b,
	0x003fa,
	0x02c98,
	0x0041c,
	0x013b7,
	0x015f0,
	0x016d6,
	0x0a4df,
	0x102b0,
	0x10311,
	0x1d427,
	0x1d45b,
	0x1d48f,
	0x1d4c3,
	0x1d4f7,
	0x1d52b,
	0x1d55f,
	0x1d593,
	0x1d5c7,
	0x1d5fb,
	0x1d62f,
	0x1d663,
	0x1d697,
	0x00578,
	0x0057c,
	0x0ff2e,
	0x02115,
	0x1d40d,
	0x1d441,
	0x1d475,
	0x1d4a9,
	0x1d4dd,
	0x1d511,
	0x1d579,
	0x1d5ad,
	0x1d5e1,
	0x1d615,
	0x1d649,
	0x1d67d,
	0x0039d,
	0x1d6b4,
	0x1d6ee,
	0x1d728,
	0x1d762,
	0x1d79c,
	0x02c9a,
	0x0a4e0,
	0x10513,
	0x00c02,
	0x00c82,
	0x00d02,
	0x00d82,
	0x00966,
	0x00a66,
	0x00ae6,
	0x00be6,
	0x00c66,
	0x00ce6,
	0x00d66,
	0x00e50,
	0x00ed0,
	0x01040,
	0x00665,
	0x006f5,
	0x0ff4f,
	0x02134,
	0x1d428,
	0x1d45c,
	0x1d490,
	0x1d4f8,
	0x1d52c,
	0x1d560,
	0x1d594,
	0x1d5c8,
	0x1d5fc,
	0x1d630,
	0x1d664,
	0x1d698,
	0x01d0f,
	0x01d11,
	0x0ab3d,
	0x003bf,
	0x1d6d0,
	0x1d70a,
	0x1d744,
	0x1d77e,
	0x1d7b8,
	0x003c3,
	0x1d6d4,
	0x1d70e,
	0x1d748,
	0x1d782,
	0x1d7bc,
	0x02c9f,
	0x0043e,
	0x010ff,
	0x00585,
	0x005e1,
	0x00647,
	0x1ee24,
	0x1ee64,
	0x1ee84,
	0x0feeb,
	0x0feec,
	0x0feea,
	0x0fee9,
	0x006be,
	0x0fbac,
	0x0fbad,
	0x0fbab,
	0x0fbaa,
	0x006c1,
	0x0fba8,
	0x0fba9,
	0x0fba7,
	0x0fba6,
	0x006d5,
	0x00d20,
	0x0101d,
	0x104ea,
	0x118c8,
	0x118d7,
	0x1042c,
	0x00030,
	0x007c0,
	0x009e6,
	0x00b66,
	0x03007,
	0x114d0,
	0x118e0,
	0x1d7ce,
	0x1d7d8,
	0x1d7e2,
	0x1d7ec,
	0x1d7f6,
	0x0ff2f,
	0x1d40e,
	0x1d442,
	0x1d476,
	0x1d4aa,
	0x1d4de,
	0x1d512,
	0x1d546,
	0x1d57a,
	0x1d5ae,
	0x1d5e2,
	0x1d616,
	0x1d64a,
	0x1d67e,
	0x0039f,
	0x1d6b6,
	0x1d6f0,
	0x1d72a,
	0x1d764,
	0x1d79e,
	0x02c9e,
	0x0041e,
	0x00555,
	0x02d54,
	0x012d0,
	0x00b20,
	0x104c2,
	0x0a4f3,
	0x118b5,
	0x10292,
	0x102ab,
	0x10404,
	0x10516,
	0x02374,
	0x0ff50,
	0x1d429,
	0x1d45d,
	0x1d491,
	0x1d4c5,
	0x1d4f9,
	0x1d52d,
	0x1d561,
	0x1d595,
	0x1d5c9,
	0x1d5fd,
	0x1d631,
	0x1d665,
	0x1d699,
	0x003c1,
	0x003f1,
	0x1d6d2,
	0x1d6e0,
	0x1d70c,
	0x1d71a,
	0x1d746,
	0x1d754,
	0x1d780,
	0x1d78e,
	0x1d7ba,
	0x1d7c8,
	0x02ca3,
	0x00440,
	0x0ff30,
	0x02119,
	0x1d40f,
	0x1d443,
	0x1d477,
	0x1d4ab,
	0x1d4df,
	0x1d513,
	0x1d57b,
	0x1d5af,
	0x1d5e3,
	0x1d617,
	0x1d64b,
	0x1d67f,
	0x003a1,
	0x1d6b8,
	0x1d6f2,
	0x1d72c,
	0x1d766,
	0x1d7a0,
	0x02ca2,
	0x00420,
	0x013e2,
	0x0146d,
	0x0a4d1,
	0x10295,
	0x1d42a,
	0x1d45e,
	0x1d492,
	0x1d4c6,
	0x1d4fa,
	0x1d52e,
	0x1d562,
	0x1d596,
	0x1d5ca,
	0x1d5fe,
	0x1d632,
	0x1d666,
	0x1d69a,
	0x0051b,
	0x00563,
	0x00566,
	0x0211a,
	0x1d410,
	0x1d444,
	0x1d478,
	0x1d4ac,
	0x1d4e0,
	0x1d514,
	0x1d57c,
	0x1d5b0,
	0x1d5e4,
	0x1d618,
	0x1d64c,
	0x1d680,
	0x02d55,
	0x1d42b,
	0x1d45f,
	0x1d493,
	0x1d4c7,
	0x1d4fb,
	0x1d52f,
	0x1d563,
	0x1d597,
	0x1d5cb,
	0x1d5ff,
	0x1d633,
	0x1d667,
	0x1d69b,
	0x0ab47,
	0x0ab48,
	0x01d26,
	0x02c85,
	0x00433,
	0x0ab81,
	0x1d216,
	0x0211b,
	0x0211c,
	0x0211d,
	0x1d411,
	0x1d445,
	0x1d479,
	0x1d4e1,
	0x1d57d,
	0x1d5b1,
	0x1d5e5,
	0x1d619,
	0x1d64d,
	0x1d681,
	0x001a6,
	0x013a1,
	0x013d2,
	0x104b4,
	0x01587,
	0x0a4e3,
	0x16f35,
	0x0ff53,
	0x1d42c,
	0x1d460,
	0x1d494,
	0x1d4c8,
	0x1d4fc,
	0x1d530,
	0x1d564,
	0x1d598,
	0x1d5cc,
	0x1d600,
	0x1d634,
	0x1d668,
	0x1d69c,
	0x0a731,
	0x001bd,
	0x00455,
	0x0abaa,
	0x118c1,
	0x10448,
	0x0ff33,
	0x1d412,
	0x1d446,
	0x1d47a,
	0x1d4ae,
	0x1d4e2,
	0x1d516,
	0x1d54a,
	0x1d57e,
	0x1d5b2,
	0x1d5e6,
	0x1d61a,
	0x1d64e,
	0x1d682,
	0x00405,
	0x0054f,
	0x013d5,
	0x013da,
	0x0a4e2,
	0x16f3a,
	0x10296,
	0x10420,
	0x1d42d,
	0x1d461,
	0x1d495,
	0x1d4c9,
	0x1d4fd,
	0x1d531,
	0x1d565,
	0x1d599,
	0x1d5cd,
	0x1d601,
	0x1d635,
	0x1d669,
	0x1d69d,
	0x022a4,
	0x027d9,
	0x1f768,
	0x0ff34,
	0x1d413,
	0x1d447,
	0x1d47b,
	0x1d4af,
	0x1d4e3,
	0x1d517,
	0x1d54b,
	0x1d57f,
	0x1d5b3,
	0x1d5e7,
	0x1d61b,
	0x1d64f,
	0x1d683,
	0x003a4,
	0x1d6bb,
	0x1d6f5,
	0x1d72f,
	0x1d769,
	0x1d7a3,
	0x02ca6,
	0x00422,
	0x013a2,
	0x0a4d4,
	0x16f0a,
	0x118bc,
	0x10297,
	0x102b1,
	0x10315,
	0x1d42e,
	0x1d462,
	0x1d496,
	0x1d4ca,
	0x1d4fe,
	0x1d532,
	0x1d566,
	0x1d59a,
	0x1d5ce,
	0x1d602,
	0x1d636,
	0x1d66a,
	0x1d69e,
	0x0a79f,
	0x01d1c,
	0x0ab4e,
	0x0ab52,
	0x0028b,
	0x003c5,
	0x1d6d6,
	0x1d710,
	0x1d74a,
	0x1d784,
	0x1d7be,
	0x0057d,
	0x104f6,
	0x118d8,
	0x0222a,
	0x022c3,
	0x1d414,
	0x1d448,
	0x1d47c,
	0x1d4b0,
	0x1d4e4,
	0x1d518,
	0x1d54c,
	0x1d580,
	0x1d5b4,
	0x1d5e8,
	0x1d61c,
	0x1d650,
	0x1d684,
	0x0054d,
	0x01200,
	0x104ce,
	0x0144c,
	0x0a4f4,
	0x16f42,
	0x118b8,
	0x02228,
	0x022c1,
	0x0ff56,
	0x02174,
	0x1d42f,
	0x1d463,
	0x1d497,
	0x1d4cb,
	0x1d4ff,
	0x1d533,
	0x1d567,
	0x1d59b,
	0x1d5cf,
	0x1d603,
	0x1d637,
	0x1d66b,
	0x1d69f,
	0x01d20,
	0x003bd,
	0x1d6ce,
	0x1d708,
	0x1d742,
	0x1d77c,
	0x1d7b6,
	0x00475,
	0x005d8,
	0x11706,
	0x0aba9,
	0x118c0,
	0x1d20d,
	0x00667,
	0x006f7,
	0x02164,
	0x1d415,
	0x1d449,
	0x1d47d,
	0x1d4b1,
	0x1d4e5,
	0x1d519,
	0x1d54d,
	0x1d581,
	0x1d5b5,
	0x1d5e9,
	0x1d61d,
	0x1d651,
	0x1d685,
	0x00474,
	0x02d38,
	0x013d9,
	0x0142f,
	0x0a6df,
	0x0a4e6,
	0x16f08,
	0x118a0,
	0x1051d,
	0x0026f,
	0x1d430,
	0x1d464,
	0x1d498,
	0x1d4cc,
	0x1d500,
	0x1d534,
	0x1d568,
	0x1d59c,
	0x1d5d0,
	0x1d604,
	0x1d638,
	0x1d66c,
	0x1d6a0,
	0x01d21,
	0x00461,
	0x0051d,
	0x00561,
	0x1170a,
	0x1170e,
	0x1170f,
	0x0ab83,
	0x118ef,
	0x118e6,
	0x1d416,
	0x1d44a,
	0x1d47e,
	0x1d4b2,
	0x1d4e6,
	0x1d51a,
	0x1d54e,
	0x1d582,
	0x1d5b6,
	0x1d5ea,
	0x1d61e,
	0x1d652,
	0x1d686,
	0x0051c,
	0x013b3,
	0x013d4,
	0x0a4ea,
	0x0166e,
	0x000d7,
	0x0292b,
	0x0292c,
	0x02a2f,
	0x0ff58,
	0x02179,
	0x1d431,
	0x1d465,
	0x1d499,
	0x1d4cd,
	0x1d501,
	0x1d535,
	0x1d569,
	0x1d59d,
	0x1d5d1,
	0x1d605,
	0x1d639,
	0x1d66d,
	0x1d6a1,
	0x00445,
	0x01541,
	0x0157d,
	0x0166d,
	0x02573,
	0x10322,
	0x118ec,
	0x0ff38,
	0x02169,
	0x1d417,
	0x1d44b,
	0x1d47f,
	0x1d4b3,
	0x1d4e7,
	0x1d51b,
	0x1d54f,
	0x1d583,
	0x1d5b7,
	0x1d5eb,
	0x1d61f,
	0x1d653,
	0x1d687,
	0x0a7b3,
	0x003a7,
	0x1d6be,
	0x1d6f8,
	0x1d732,
	0x1d76c,
	0x1d7a6,
	0x02cac,
	0x00425,
	0x02d5d,
	0x016b7,
	0x0a4eb,
	0x10290,
	0x102b4,
	0x10317,
	0x10527,
	0x00263,
	0x01d8c,
	0x0ff59,
	0x1d432,
	0x1d466,
	0x1d49a,
	0x1d4ce,
	0x1d502,
	0x1d536,
	0x1d56a,
	0x1d59e,
	0x1d5d2,
	0x1d606,
	0x1d63a,
	0x1d66e,
	0x1d6a2,
	0x0028f,
	0x01eff,
	0x0ab5a,
	0x003b3,
	0x0213d,
	0x1d6c4,
	0x1d6fe,
	0x1d738,
	0x1d772,
	0x1d7ac,
	0x00443,
	0x004af,
	0x010e7,
	0x118dc,
	0x0ff39,
	0x1d418,
	0x1d44c,
	0x1d480,
	0x1d4b4,
	0x1d4e8,
	0x1d51c,
	0x1d550,
	0x1d584,
	0x1d5b8,
	0x1d5ec,
	0x1d620,
	0x1d654,
	0x1d688,
	0x003a5,
	0x003d2,
	0x1d6bc,
	0x1d6f6,
	0x1d730,
	0x1d76a,
	0x1d7a4,
	0x02ca8,
	0x00423,
	0x004ae,
	0x013a9,
	0x013bd,
	0x0a4ec,
	0x16f43,
	0x118a4,
	0x102b2,
	0x1d433,
	0x1d467,
	0x1d49b,
	0x1d4cf,
	0x1d503,
	0x1d537,
	0x1d56b,
	0x1d59f,
	0x1d5d3,
	0x1d607,
	0x1d63b,
	0x1d66f,
	0x1d6a3,
	0x01d22,
	0x0ab93,
	0x118c4,
	0x102f5,
	0x118e5,
	0x0ff3a,
	0x02124,
	0x02128,
	0x1d419,
	0x1d44d,
	0x1d481,
	0x1d4b5,
	0x1d4e9,
	0x1d585,
	0x1d5b9,
	0x1d5ed,
	0x1d621,
	0x1d655,
	0x1d689,
	0x00396,
	0x1d6ad,
	0x1d6e7,
	0x1d721,
	0x1d75b,
	0x1d795,
	0x013c3,
	0x0a4dc,
	0x118a9,
};

/*
 * Tells whether the code point `ch` is listed in the latin_confusable set.
 * Returns TRUE when it is listed and FALSE otherwise.
 */
static gboolean
rspamd_can_alias_latin(int ch)
{
	const auto hit = latin_confusable.find(ch);

	return hit != latin_confusable.end() ? TRUE : FALSE;
}

static double
rspamd_chartable_process_word_utf(struct rspamd_task *task,
								  rspamd_word_t *w,
								  gboolean is_url,
								  unsigned int *ncap,
								  struct chartable_ctx *chartable_module_ctx,
								  gboolean ignore_diacritics)
{
	const UChar32 *p, *end;
	double badness = 0.0;
	UChar32 uc;
	UBlockCode sc;
	unsigned int cat;
	int last_is_latin = -1;
	unsigned int same_script_count = 0, nsym = 0, nspecial = 0;
	enum {
		start_process = 0,
		got_alpha,
		got_digit,
		got_unknown,
	} state = start_process,
	  prev_state = start_process;

	p = w->unicode.begin;
	end = p + w->unicode.len;

	/* We assume that w is normalized */

	while (p < end) {
		uc = *p++;

		if (((int32_t) uc) < 0) {
			break;
		}

		sc = ublock_getCode(uc);
		cat = u_charType(uc);

		if (!ignore_diacritics) {
			if (cat == U_NON_SPACING_MARK ||
				(sc == UBLOCK_LATIN_1_SUPPLEMENT) ||
				(sc == UBLOCK_LATIN_EXTENDED_A) ||
				(sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
				(sc == UBLOCK_LATIN_EXTENDED_B) ||
				(sc == UBLOCK_COMBINING_DIACRITICAL_MARKS)) {
				nspecial++;
			}
		}

		if (u_isalpha(uc)) {

			if (sc <= UBLOCK_COMBINING_DIACRITICAL_MARKS ||
				sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) {
				/*
				 * Assume all latin, IPA, diacritic and space modifiers
				 * characters as basic latin
				 */
				sc = UBLOCK_BASIC_LATIN;
			}

			if (sc != UBLOCK_BASIC_LATIN && u_isupper(uc)) {
				if (ncap) {
					(*ncap)++;
				}
			}

			if (state == got_digit) {
				/* Penalize digit -> alpha translations */
				if (!is_url && sc != UBLOCK_BASIC_LATIN &&
					prev_state != start_process) {
					badness += 0.25;
				}
			}
			else if (state == got_alpha) {
				/* Check script */
				if (same_script_count > 0) {
					if (sc != UBLOCK_BASIC_LATIN && last_is_latin) {

						if (rspamd_can_alias_latin(uc)) {
							badness += 1.0 / (double) same_script_count;
						}

						last_is_latin = 0;
						same_script_count = 1;
					}
					else {
						same_script_count++;
					}
				}
				else {
					last_is_latin = sc == UBLOCK_BASIC_LATIN;
					same_script_count = 1;
				}
			}

			prev_state = state;
			state = got_alpha;
		}
		else if (u_isdigit(uc)) {
			if (state != got_digit) {
				prev_state = state;
			}

			state = got_digit;
			same_script_count = 0;
		}
		else {
			/* We don't care about unknown characters here */
			if (state != got_unknown) {
				prev_state = state;
			}

			state = got_unknown;
			same_script_count = 0;
		}

		nsym++;
	}

	if (nspecial > 0) {
		if (!ignore_diacritics) {
			/* Count diacritics  */
			badness += nspecial;
		}
		else if (nspecial > 1) {
			badness += (nspecial - 1.0) / 2.0;
		}
	}

	/* Try to avoid FP for long words */
	if (nsym > chartable_module_ctx->max_word_len) {
		badness = 0;
	}
	else {
		if (badness > 4.0) {
			badness = 4.0;
		}
	}

	msg_debug_chartable("word %*s, badness: %.2f",
						(int) w->normalized.len, w->normalized.begin,
						badness);

	return badness;
}

/*
 * Computes a "badness" value for a single word using the raw bytes of its
 * normalized form (w->normalized); used for words without the UTF flag.
 *
 * Bytes are split into two classes: ASCII letters and bytes above 0x7f.
 * Badness is accumulated from:
 *  - a digit followed by a letter that is not a hex digit, when a letter has
 *    already been seen in the word and the word is not an URL: +0.25;
 *  - a change of class inside a run of letters: +1 / <length of the run>.
 *
 * @param task                 task, used for debug logging only
 * @param w                    word to analyse
 * @param is_url               TRUE when the word comes from an URL
 * @param chartable_module_ctx module context (max_word_len is read)
 * @return badness in the range [0, 4]; 0 for words longer than max_word_len
 */
static double
rspamd_chartable_process_word_ascii(struct rspamd_task *task,
									rspamd_word_t *w,
									gboolean is_url,
									struct chartable_ctx *chartable_module_ctx)
{
	double badness = 0.0;
	enum {
		ascii = 1,
		non_ascii
	} sc,
		last_sc = non_ascii; /* always overwritten before the first comparison */
	int same_script_count = 0, seen_alpha = FALSE;
	enum {
		start_process = 0,
		got_alpha,
		got_digit,
		got_unknown,
	} state = start_process;

	const auto *p = (const unsigned char *) w->normalized.begin;
	const auto *end = p + w->normalized.len;

	/* Try to avoid FP for long words */
	if (w->normalized.len > chartable_module_ctx->max_word_len) {
		return 0.0;
	}

	/* We assume that w is normalized */
	while (p < end) {
		if (g_ascii_isalpha(*p) || *p > 0x7f) {

			if (state == got_digit) {
				/* Penalize digit -> alpha translations */
				if (seen_alpha && !is_url && !g_ascii_isxdigit(*p)) {
					badness += 0.25;
				}
			}
			else if (state == got_alpha) {
				/*
				 * Check script: bytes above 0x7f are the non-ASCII class.
				 * The labels used to be swapped here; only equality of the
				 * classes is ever tested, so the score is unaffected.
				 */
				sc = (*p > 0x7f) ? non_ascii : ascii;

				if (same_script_count > 0) {
					if (sc != last_sc) {
						badness += 1.0 / (double) same_script_count;
						last_sc = sc;
						same_script_count = 1;
					}
					else {
						same_script_count++;
					}
				}
				else {
					/* First letter counted in the current run of letters */
					last_sc = sc;
					same_script_count = 1;
				}
			}

			seen_alpha = TRUE;
			state = got_alpha;
		}
		else if (g_ascii_isdigit(*p)) {
			state = got_digit;
			same_script_count = 0;
		}
		else {
			/* We don't care about unknown characters here */
			state = got_unknown;
			same_script_count = 0;
		}

		p++;
	}

	if (badness > 4.0) {
		badness = 4.0;
	}

	msg_debug_chartable("word %*s, badness: %.2f",
						(int) w->normalized.len, w->normalized.begin,
						badness);

	return badness;
}

/*
 * Scores one text part: sums the badness of every word flagged as text,
 * divides the sum by part->nwords, caps the result at 1.0 and inserts the
 * module symbol when the result is above the configured threshold.
 * The number of uppercase non-Latin letters reported by the UTF word scorer
 * is added to part->capital_letters.
 *
 * Returns TRUE when the symbol has been inserted and FALSE otherwise
 * (including a null part and parts without words).
 */
static gboolean
rspamd_chartable_process_part(struct rspamd_task *task,
							  struct rspamd_mime_text_part *part,
							  struct chartable_ctx *chartable_module_ctx,
							  gboolean ignore_diacritics)
{
	/* Nothing to analyse */
	if (part == nullptr || part->utf_words.a == nullptr ||
		kv_size(part->utf_words) == 0 || part->nwords == 0) {
		return FALSE;
	}

	unsigned int uppercase_seen = 0;
	double badness_sum = 0.0;

	for (unsigned int idx = 0; idx < kv_size(part->utf_words); idx++) {
		rspamd_word_t *word = &kv_A(part->utf_words, idx);

		/* Only words flagged as text are scored */
		if (!(word->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
			continue;
		}

		if (word->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
			badness_sum += rspamd_chartable_process_word_utf(task, word, FALSE,
															 &uppercase_seen,
															 chartable_module_ctx,
															 ignore_diacritics);
		}
		else {
			badness_sum += rspamd_chartable_process_word_ascii(task, word,
															   FALSE,
															   chartable_module_ctx);
		}
	}

	/*
	 * TODO: perhaps, we should do this analysis somewhere else and get
	 * something like: <SYM_SC><SYM_SC><SYM_SC> representing classes for all
	 * symbols in the text
	 */
	part->capital_letters += uppercase_seen;

	double part_score = badness_sum / (double) part->nwords;

	if (part_score > 1.0) {
		part_score = 1.0;
	}

	if (!(part_score > chartable_module_ctx->threshold)) {
		return FALSE;
	}

	rspamd_task_insert_result(task, chartable_module_ctx->symbol,
							  part_score, nullptr);

	return TRUE;
}

static void
chartable_symbol_callback(struct rspamd_task *task,
						  struct rspamd_symcache_dynamic_item *item,
						  void *_)
{
	unsigned int i;
	struct rspamd_mime_text_part *part;
	struct chartable_ctx *chartable_module_ctx = chartable_get_context(task->cfg);
	gboolean ignore_diacritics = TRUE, seen_violated_part = FALSE;

	/* Check if we have parts with diacritic symbols language */
	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
	{
		if (part->languages && part->languages->len > 0) {
			auto *lang = (struct rspamd_lang_detector_res *) g_ptr_array_index(part->languages, 0);
			int flags;

			flags = rspamd_language_detector_elt_flags(lang->elt);

			if ((flags & RS_LANGUAGE_DIACRITICS)) {
				ignore_diacritics = TRUE;
			}
			else if (lang->prob > 0.75) {
				ignore_diacritics = FALSE;
			}
		}

		if (rspamd_chartable_process_part(task, part, chartable_module_ctx, ignore_diacritics)) {
			seen_violated_part = TRUE;
		}
	}

	if (MESSAGE_FIELD(task, text_parts)->len == 0) {
		/* No text parts, assume that we should ignore diacritics checks for metatokens */
		ignore_diacritics = TRUE;
	}

	if (task->meta_words.a && kv_size(task->meta_words) > 0) {
		rspamd_word_t *w;
		double cur_score = 0;
		gsize arlen = kv_size(task->meta_words);

		for (i = 0; i < arlen; i++) {
			w = &kv_A(task->meta_words, i);
			cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
														   nullptr, chartable_module_ctx, ignore_diacritics);
		}

		cur_score /= (double) (arlen + 1);

		if (cur_score > 1.0) {
			cur_score = 1.0;
		}

		if (cur_score > chartable_module_ctx->threshold) {
			if (!seen_violated_part) {
				/* Further penalise */
				if (cur_score > 0.25) {
					cur_score = 0.25;
				}
			}

			rspamd_task_insert_result(task, chartable_module_ctx->symbol,
									  cur_score, "subject");
		}
	}

	rspamd_symcache_finalize_item(task, item);
}

/*
 * Callback of the URL chartable symbol.
 *
 * The scoring body is compiled out with `#if 0`, so the only thing this
 * callback currently does is finalize the symcache item.
 */
static void
chartable_url_symbol_callback(struct rspamd_task *task,
							  struct rspamd_symcache_dynamic_item *item,
							  void *unused)
{
	/* XXX: TODO: unbreak module once URLs unicode project is over */
	/*
	 * NOTE(review): the disabled code below does not match the rest of this
	 * file any more: it calls rspamd_chartable_process_word_utf() with five
	 * arguments while the current definition takes six (ignore_diacritics),
	 * and it inserts chartable_module_ctx->symbol rather than url_symbol.
	 * Both need to be revisited before the block is re-enabled.
	 */
#if 0
	struct rspamd_url *u;
	GHashTableIter it;
	gpointer k, v;
	rspamd_stat_token_t w;
	double cur_score = 0.0;
	struct chartable_ctx *chartable_module_ctx = chartable_get_context (task->cfg);

	g_hash_table_iter_init (&it, task->urls);

	while (g_hash_table_iter_next (&it, &k, &v)) {
		u = v;

		if (cur_score > 2.0) {
			cur_score = 2.0;
			break;
		}

		if (u->hostlen > 0) {
			w.stemmed.begin = u->host;
			w.stemmed.len = u->hostlen;

			if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, nullptr)) {
				cur_score += rspamd_chartable_process_word_utf (task, &w,
						TRUE, nullptr, chartable_module_ctx);
			}
			else {
				cur_score += rspamd_chartable_process_word_ascii (task, &w,
						TRUE, chartable_module_ctx);
			}
		}
	}

	g_hash_table_iter_init (&it, task->emails);

	while (g_hash_table_iter_next (&it, &k, &v)) {
		u = v;

		if (cur_score > 2.0) {
			cur_score = 2.0;
			break;
		}

		if (u->hostlen > 0) {
			w.stemmed.begin = u->host;
			w.stemmed.len = u->hostlen;

			if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, nullptr)) {
				cur_score += rspamd_chartable_process_word_utf (task, &w,
						TRUE, nullptr, chartable_module_ctx);
			}
			else {
				cur_score += rspamd_chartable_process_word_ascii (task, &w,
						TRUE, chartable_module_ctx);
			}
		}
	}

	if (cur_score > chartable_module_ctx->threshold) {
		rspamd_task_insert_result (task, chartable_module_ctx->symbol,
				cur_score, nullptr);

	}
#endif
	/* The item must be finalized even though no check is performed */
	rspamd_symcache_finalize_item(task, item);
}
