beeta/chat-service/src/services/CensorService.cpp
2026-01-05 22:54:27 -05:00

279 lines
9.5 KiB
C++

#include "CensorService.h"
#include <drogon/drogon.h>
#include <drogon/HttpClient.h>
#include <sstream>
#include <algorithm>
#include <cctype>
namespace services {
// Returns the process-wide CensorService instance (Meyers singleton:
// function-local static initialization is thread-safe since C++11).
CensorService& CensorService::getInstance() {
static CensorService instance;
return instance;
}
// Marks the service as initialized. Note: this does NOT load any words;
// the actual fetch is triggered by scheduleFetch() or invalidateCache().
void CensorService::initialize() {
initialized_ = true;
LOG_INFO << "CensorService initialized";
}
// Schedules a one-shot asynchronous fetch of the censored-word list on the
// main event loop, delayed 2 seconds (presumably to let the backend finish
// starting up — TODO confirm). Capturing `this` in the timer callback is
// safe because the service is a process-lifetime singleton.
void CensorService::scheduleFetch() {
LOG_INFO << "Scheduling censored words fetch in 2 seconds...";
drogon::app().getLoop()->runAfter(2.0, [this]() {
LOG_INFO << "Pre-fetching censored words from backend...";
fetchCensoredWordsAsync();
});
// No periodic refresh - cache invalidation is triggered by backend
}
// Drops the current word cache by re-fetching from the backend. Called when
// the backend signals that the censored-word list has changed; the old
// pattern stays in effect until the async fetch completes and swaps it out.
void CensorService::invalidateCache() {
LOG_INFO << "Cache invalidation requested, fetching censored words from backend...";
fetchCensoredWordsAsync();
}
// Synchronously fetches the censored-word list from the backend API and
// swaps it into the shared cache. Blocks the calling thread for up to 5
// seconds; prefer fetchCensoredWordsAsync() when running on an event loop.
// On any failure the existing cached word list is left untouched.
void CensorService::fetchCensoredWordsFromBackend() {
    auto config = drogon::app().getCustomConfig();
    auto backendConfig = config.get("backend_api", Json::Value::null);
    std::string host;
    int port;
    if (backendConfig.isNull() || !backendConfig.isMember("host")) {
        // No backend_api section configured: fall back to the default
        // in-cluster service name and port.
        host = "drogon-backend";
        port = 8080;
    } else {
        host = backendConfig.get("host", "drogon-backend").asString();
        port = backendConfig.get("port", 8080).asInt();
    }
    auto client = drogon::HttpClient::newHttpClient("http://" + host + ":" + std::to_string(port));
    auto req = drogon::HttpRequest::newHttpRequest();
    req->setMethod(drogon::Get);
    req->setPath("/api/internal/censored-words");
    // Blocking request with a 5-second timeout.
    std::pair<drogon::ReqResult, drogon::HttpResponsePtr> result = client->sendRequest(req, 5.0);
    if (result.first != drogon::ReqResult::Ok) {
        LOG_ERROR << "Failed to fetch censored words from backend: request failed";
        return;
    }
    auto resp = result.second;
    if (resp->getStatusCode() != drogon::k200OK) {
        LOG_ERROR << "Failed to fetch censored words from backend: HTTP " << resp->getStatusCode();
        return;
    }
    try {
        auto json = resp->getJsonObject();
        if (!json || !(*json)["success"].asBool()) {
            LOG_ERROR << "Failed to fetch censored words: invalid response";
            return;
        }
        // Backend sends the words as one comma-separated string.
        std::string wordsStr = (*json)["censored_words"].asString();
        // Build the replacement data in locals so the write lock is held only
        // for two moves, never during parsing or regex compilation.
        std::vector<std::string> newWords;
        std::optional<std::regex> newPattern;
        if (!wordsStr.empty()) {
            std::stringstream ss(wordsStr);
            std::string word;
            while (std::getline(ss, word, ',') && newWords.size() < MAX_WORD_COUNT) {
                // Trim surrounding whitespace; all-whitespace entries are dropped.
                size_t start = word.find_first_not_of(" \t\r\n");
                size_t end = word.find_last_not_of(" \t\r\n");
                if (start != std::string::npos && end != std::string::npos) {
                    word = word.substr(start, end - start + 1);
                    // Skip empty words and words exceeding max length (ReDoS prevention)
                    if (!word.empty() && word.length() <= MAX_WORD_LENGTH) {
                        newWords.push_back(word);
                    } else if (word.length() > MAX_WORD_LENGTH) {
                        LOG_WARN << "Skipping censored word exceeding " << MAX_WORD_LENGTH << " chars";
                    }
                }
            }
            newPattern = buildCombinedPattern(newWords);
        }
        // Capture the count before the move: after the swap `newWords` is
        // moved-from, and the original read of censoredWords_.size() happened
        // outside the lock — a data race with any concurrent update.
        const size_t wordCount = newWords.size();
        // Atomic swap under lock
        {
            std::unique_lock<std::shared_mutex> lock(mutex_);
            censoredWords_ = std::move(newWords);
            combinedPattern_ = std::move(newPattern);
        }
        LOG_DEBUG << "Fetched " << wordCount << " censored words from backend";
    } catch (const std::exception& e) {
        LOG_ERROR << "Error parsing censored words response: " << e.what();
    }
}
// Asynchronously fetches the censored-word list from the backend API on the
// main event loop and swaps it into the shared cache when the response
// arrives (10-second timeout). On any failure the existing cache is left
// untouched. Capturing `this` is safe: the service is a singleton; the
// client is captured to keep it alive until the callback fires.
void CensorService::fetchCensoredWordsAsync() {
    auto config = drogon::app().getCustomConfig();
    auto backendConfig = config.get("backend_api", Json::Value::null);
    std::string host;
    int port;
    if (backendConfig.isNull() || !backendConfig.isMember("host")) {
        // No backend_api section configured: fall back to the default
        // in-cluster service name and port.
        host = "drogon-backend";
        port = 8080;
    } else {
        host = backendConfig.get("host", "drogon-backend").asString();
        port = backendConfig.get("port", 8080).asInt();
    }
    std::string url = "http://" + host + ":" + std::to_string(port);
    auto client = drogon::HttpClient::newHttpClient(url, drogon::app().getLoop());
    auto req = drogon::HttpRequest::newHttpRequest();
    req->setMethod(drogon::Get);
    req->setPath("/api/internal/censored-words");
    client->sendRequest(req, [this, client](drogon::ReqResult result, const drogon::HttpResponsePtr& resp) {
        if (result != drogon::ReqResult::Ok) {
            LOG_ERROR << "Async fetch censored words failed";
            return;
        }
        if (resp->getStatusCode() != drogon::k200OK) {
            LOG_ERROR << "Async fetch censored words failed: HTTP " << resp->getStatusCode();
            return;
        }
        try {
            auto json = resp->getJsonObject();
            if (!json || !(*json)["success"].asBool()) {
                LOG_ERROR << "Async fetch censored words: invalid response";
                return;
            }
            // Backend sends the words as one comma-separated string.
            std::string wordsStr = (*json)["censored_words"].asString();
            // Build the replacement data in locals so the write lock is held
            // only for two moves, never during parsing or regex compilation.
            std::vector<std::string> newWords;
            std::optional<std::regex> newPattern;
            if (!wordsStr.empty()) {
                std::stringstream ss(wordsStr);
                std::string word;
                while (std::getline(ss, word, ',') && newWords.size() < MAX_WORD_COUNT) {
                    // Trim surrounding whitespace; all-whitespace entries are dropped.
                    size_t start = word.find_first_not_of(" \t\r\n");
                    size_t end = word.find_last_not_of(" \t\r\n");
                    if (start != std::string::npos && end != std::string::npos) {
                        word = word.substr(start, end - start + 1);
                        // Skip empty words and words exceeding max length (ReDoS prevention)
                        if (!word.empty() && word.length() <= MAX_WORD_LENGTH) {
                            newWords.push_back(word);
                        } else if (word.length() > MAX_WORD_LENGTH) {
                            LOG_WARN << "Skipping censored word exceeding " << MAX_WORD_LENGTH << " chars";
                        }
                    }
                }
                newPattern = buildCombinedPattern(newWords);
            }
            // Capture the count before the move: after the swap `newWords` is
            // moved-from, and the original read of censoredWords_.size()
            // happened outside the lock — a data race with concurrent updates.
            const size_t wordCount = newWords.size();
            // Atomic swap under lock
            {
                std::unique_lock<std::shared_mutex> lock(mutex_);
                censoredWords_ = std::move(newWords);
                combinedPattern_ = std::move(newPattern);
            }
            LOG_INFO << "Successfully fetched " << wordCount << " censored words from backend";
        } catch (const std::exception& e) {
            LOG_ERROR << "Error parsing async censored words response: " << e.what();
        }
    }, 10.0);
}
// Compiles the word list into a single case-insensitive alternation of the
// form \b(word1|word2|...)\b. Returns std::nullopt for an empty list or if
// the regex fails to compile (the failure is logged).
std::optional<std::regex> CensorService::buildCombinedPattern(const std::vector<std::string>& words) {
    if (words.empty()) {
        return std::nullopt;
    }
    try {
        // Characters that must be backslash-escaped inside the pattern.
        static const std::string kSpecials = ".^$*+?()[]{}|\\";
        std::string pattern = "\\b(";
        for (size_t i = 0; i < words.size(); ++i) {
            if (i > 0) {
                pattern += '|';
            }
            // Escape regex metacharacters so each word matches literally.
            for (char c : words[i]) {
                if (kSpecials.find(c) != std::string::npos) {
                    pattern += '\\';
                }
                pattern += c;
            }
        }
        pattern += ")\\b";
        return std::regex(pattern, std::regex_constants::icase);
    } catch (const std::regex_error& e) {
        LOG_ERROR << "Failed to build combined censored pattern: " << e.what();
        return std::nullopt;
    }
}
// Returns `text` with every censored-word match replaced by a fixed "****"
// mask (fixed width, regardless of the matched word's length). If no word
// list is loaded, or a regex error occurs, the input is returned unchanged.
// Thread-safe: takes a shared (read) lock on the pattern.
std::string CensorService::censor(const std::string& text) {
    if (text.empty()) {
        return text;
    }
    std::shared_lock<std::shared_mutex> guard(mutex_);
    if (!combinedPattern_.has_value()) {
        // No pattern yet (empty list or fetch not completed): pass through.
        return text;
    }
    std::string masked;
    try {
        auto match = std::sregex_iterator(text.begin(), text.end(), *combinedPattern_);
        const auto done = std::sregex_iterator();
        std::size_t cursor = 0;
        while (match != done) {
            const std::smatch& m = *match;
            // Copy the unmatched gap, then the fixed-width mask.
            masked.append(text, cursor, static_cast<std::size_t>(m.position()) - cursor);
            masked += "****";
            cursor = static_cast<std::size_t>(m.position() + m.length());
            ++match;
        }
        // Tail after the final match.
        masked.append(text, cursor, std::string::npos);
    } catch (const std::regex_error& e) {
        LOG_ERROR << "Regex replace error: " << e.what();
        return text;
    }
    return masked;
}
// Read-only probe: returns true iff any censored word occurs in `text`.
// Returns false for empty input, when no word list is loaded, or on a
// regex error (which is logged). Thread-safe via a shared (read) lock.
bool CensorService::containsCensoredWords(const std::string& text) {
    if (text.empty()) {
        return false;
    }
    std::shared_lock<std::shared_mutex> guard(mutex_);
    if (!combinedPattern_.has_value()) {
        return false;
    }
    try {
        return std::regex_search(text, combinedPattern_.value());
    } catch (const std::regex_error& e) {
        LOG_ERROR << "Regex search error: " << e.what();
        return false;
    }
}
} // namespace services