Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
57.29% |
110 / 192 |
|
66.67% |
6 / 9 |
CRAP | |
0.00% |
0 / 1 |
| SearchUtils | |
57.29% |
110 / 192 |
|
66.67% |
6 / 9 |
164.64 | |
0.00% |
0 / 1 |
| getSearchResults | |
0.00% |
0 / 73 |
|
0.00% |
0 / 1 |
90 | |||
| sqlExecute | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| getDateFormattings | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
| getDateSql | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
| getCutout | |
100.00% |
49 / 49 |
|
100.00% |
1 / 1 |
12 | |||
| censorEmails | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getOffsets | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
5 | |||
| highlight | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
4 | |||
| normalizeRanges | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Olz\Suche\Utils; |
| 4 | |
| 5 | use Doctrine\ORM\Query\ResultSetMapping; |
| 6 | use Olz\Apps\Anmelden\Components\OlzAnmelden\OlzAnmelden; |
| 7 | use Olz\Apps\Commands\Components\OlzCommands\OlzCommands; |
| 8 | use Olz\Apps\Files\Components\OlzFiles\OlzFiles; |
| 9 | use Olz\Apps\Logs\Components\OlzLogs\OlzLogs; |
| 10 | use Olz\Apps\Members\Components\OlzMembers\OlzMembers; |
| 11 | use Olz\Apps\Monitoring\Components\OlzMonitoring\OlzMonitoring; |
| 12 | use Olz\Apps\Newsletter\Components\OlzNewsletter\OlzNewsletter; |
| 13 | use Olz\Apps\Oev\Components\OlzOev\OlzOev; |
| 14 | use Olz\Apps\Panini2024\Components\OlzPanini2024\OlzPanini2024; |
| 15 | use Olz\Apps\Panini2024\Components\OlzPanini2024All\OlzPanini2024All; |
| 16 | use Olz\Apps\Panini2024\Components\OlzPanini2024Masks\OlzPanini2024Masks; |
| 17 | use Olz\Apps\Quiz\Components\OlzQuiz\OlzQuiz; |
| 18 | use Olz\Apps\Results\Components\OlzResults\OlzResults; |
| 19 | use Olz\Apps\SearchEngines\Components\OlzSearchEngines\OlzSearchEngines; |
| 20 | use Olz\Apps\Statistics\Components\OlzStatistics\OlzStatistics; |
| 21 | use Olz\Apps\Youtube\Components\OlzYoutube\OlzYoutube; |
| 22 | use Olz\Components\Auth\OlzEmailReaktion\OlzEmailReaktion; |
| 23 | use Olz\Components\Common\OlzRootComponent; |
| 24 | use Olz\Components\OlzHtmlSitemap\OlzHtmlSitemap; |
| 25 | use Olz\Components\OtherPages\OlzDatenschutz\OlzDatenschutz; |
| 26 | use Olz\Components\OtherPages\OlzFuerEinsteiger\OlzFuerEinsteiger; |
| 27 | use Olz\Components\OtherPages\OlzMaterial\OlzMaterial; |
| 28 | use Olz\Faq\Components\OlzFaqDetail\OlzFaqDetail; |
| 29 | use Olz\Faq\Components\OlzFaqList\OlzFaqList; |
| 30 | use Olz\Karten\Components\OlzKarteDetail\OlzKarteDetail; |
| 31 | use Olz\Karten\Components\OlzKarten\OlzKarten; |
| 32 | use Olz\News\Components\OlzNewsDetail\OlzNewsDetail; |
| 33 | use Olz\News\Components\OlzNewsList\OlzNewsList; |
| 34 | use Olz\Roles\Components\OlzRolePage\OlzRolePage; |
| 35 | use Olz\Roles\Components\OlzVerein\OlzVerein; |
| 36 | use Olz\Service\Components\OlzService\OlzService; |
| 37 | use Olz\Startseite\Components\OlzStartseite\OlzStartseite; |
| 38 | use Olz\Suche\Components\OlzSuche\OlzSuche; |
| 39 | use Olz\Termine\Components\OlzTerminDetail\OlzTerminDetail; |
| 40 | use Olz\Termine\Components\OlzTermineList\OlzTermineList; |
| 41 | use Olz\Termine\Components\OlzTerminLocationDetail\OlzTerminLocationDetail; |
| 42 | use Olz\Termine\Components\OlzTerminLocationsList\OlzTerminLocationsList; |
| 43 | use Olz\Termine\Components\OlzTerminTemplateDetail\OlzTerminTemplateDetail; |
| 44 | use Olz\Termine\Components\OlzTerminTemplatesList\OlzTerminTemplatesList; |
| 45 | use Olz\Users\Components\OlzUserDetail\OlzUserDetail; |
| 46 | use Olz\Utils\WithUtilsTrait; |
| 47 | |
| 48 | /** |
| 49 | * @phpstan-type SearchResult array{ |
| 50 | * link: non-empty-string, |
| 51 | * icon: ?non-empty-string, |
| 52 | * date: ?\DateTime, |
| 53 | * title: non-empty-string, |
| 54 | * text: ?non-empty-string, |
| 55 | * score: float, |
| 56 | * debug: string, |
| 57 | * } |
| 58 | */ |
| 59 | class SearchUtils { |
| 60 | use WithUtilsTrait; |
| 61 | |
| 62 | /** @var array<class-string<OlzRootComponent<mixed>>> */ |
| 63 | protected static array $all_page_classes = [ |
| 64 | // All classes that extend `OlzRootComponent` should be listed here: |
| 65 | OlzAnmelden::class, |
| 66 | OlzCommands::class, |
| 67 | OlzFiles::class, |
| 68 | OlzLogs::class, |
| 69 | OlzMembers::class, |
| 70 | OlzMonitoring::class, |
| 71 | OlzNewsletter::class, |
| 72 | OlzOev::class, |
| 73 | OlzPanini2024::class, |
| 74 | OlzPanini2024All::class, |
| 75 | OlzPanini2024Masks::class, |
| 76 | OlzQuiz::class, |
| 77 | OlzResults::class, |
| 78 | OlzSearchEngines::class, |
| 79 | OlzStatistics::class, |
| 80 | OlzYoutube::class, |
| 81 | OlzEmailReaktion::class, |
| 82 | OlzHtmlSitemap::class, |
| 83 | OlzDatenschutz::class, |
| 84 | OlzFuerEinsteiger::class, |
| 85 | OlzMaterial::class, |
| 86 | OlzFaqDetail::class, |
| 87 | OlzFaqList::class, |
| 88 | OlzKarteDetail::class, |
| 89 | OlzKarten::class, |
| 90 | OlzNewsDetail::class, |
| 91 | OlzNewsList::class, |
| 92 | OlzRolePage::class, |
| 93 | OlzVerein::class, |
| 94 | OlzService::class, |
| 95 | OlzStartseite::class, |
| 96 | OlzSuche::class, |
| 97 | OlzTerminDetail::class, |
| 98 | OlzTermineList::class, |
| 99 | OlzTerminLocationDetail::class, |
| 100 | OlzTerminLocationsList::class, |
| 101 | OlzTerminTemplateDetail::class, |
| 102 | OlzTerminTemplatesList::class, |
| 103 | OlzUserDetail::class, |
| 104 | ]; |
| 105 | |
| 106 | /** |
| 107 | * @param array<string> $terms |
| 108 | * |
| 109 | * @return array<SearchResult> |
| 110 | */ |
| 111 | public function getSearchResults(array $terms): array { |
| 112 | $db = $this->dbUtils()->getDb(); |
| 113 | $num_terms = count($terms); |
| 114 | $esc_terms = array_map(fn ($term) => $db->real_escape_string($term), $terms); |
| 115 | $page_queries = []; |
| 116 | $page_withs = []; |
| 117 | foreach (self::$all_page_classes as $page_class) { |
| 118 | $page = new $page_class(); |
| 119 | $result = $page->searchSql($esc_terms); |
| 120 | if (is_array($result)) { |
| 121 | $page_queries[] = $result['query']; |
| 122 | foreach ($result['with'] as $with) { |
| 123 | $page_withs[] = $with; |
| 124 | } |
| 125 | } elseif (is_string($result)) { |
| 126 | $page_queries[] = $result; |
| 127 | } // else ignore |
| 128 | } |
| 129 | $page_withs_sql = implode(',', $page_withs); |
| 130 | $merged_sql = implode(' UNION ALL ', $page_queries); |
| 131 | |
| 132 | $idx = 0; |
| 133 | $term_evaluation_sqls = []; |
| 134 | $term_evaluation_columns = []; |
| 135 | foreach ($terms as $term) { |
| 136 | $esc_term = $db->real_escape_string(preg_quote($term)); |
| 137 | $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_term}') AS term{$idx}_any"; |
| 138 | $term_evaluation_columns[] = "term{$idx}_any"; |
| 139 | // Add preference for word matches |
| 140 | $term_evaluation_sqls[] = "get_quality(concatted_content, '(?=\\\\W|^){$esc_term}') AS term{$idx}_prefix"; |
| 141 | $term_evaluation_columns[] = "term{$idx}_prefix"; |
| 142 | $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_term}(?=\\\\W|$)') AS term{$idx}_suffix"; |
| 143 | $term_evaluation_columns[] = "term{$idx}_suffix"; |
| 144 | $idx++; |
| 145 | } |
| 146 | // Add preference to term combination matches |
| 147 | for ($num_combined = 2; $num_combined <= min(3, $num_terms); $num_combined++) { |
| 148 | for ($start_combined = 0; $start_combined <= $num_terms - $num_combined; $start_combined++) { |
| 149 | $combined_terms = array_slice($terms, $start_combined, $num_combined); |
| 150 | $esc_combined_terms = $db->real_escape_string(implode('(\W{0,5}|\s*)', array_map( |
| 151 | fn ($term) => preg_quote($term), |
| 152 | $combined_terms |
| 153 | ))); |
| 154 | $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_combined_terms}') AS terms_{$start_combined}_{$num_combined}"; |
| 155 | $term_evaluation_columns[] = "terms_{$start_combined}_{$num_combined}"; |
| 156 | // TODO: Combined date formattings? |
| 157 | } |
| 158 | } |
| 159 | $term_evaluation_sql = implode(',', $term_evaluation_sqls); |
| 160 | $term_quality_sql = implode('+', $term_evaluation_columns); |
| 161 | $rsm = new ResultSetMapping(); |
| 162 | $rsm->addScalarResult('link', 'link', 'string'); |
| 163 | $rsm->addScalarResult('icon', 'icon', 'string'); |
| 164 | $rsm->addScalarResult('date', 'date', 'date'); |
| 165 | $rsm->addScalarResult('title', 'title', 'string'); |
| 166 | $rsm->addScalarResult('text', 'text', 'string'); |
| 167 | $rsm->addScalarResult('score', 'score', 'string'); |
| 168 | $rsm->addScalarResult('time_relevance', 'time_relevance', 'string'); |
| 169 | $this->sqlExecute('DROP FUNCTION IF EXISTS get_quality'); |
| 170 | $this->sqlExecute(<<<'ZZZZZZZZZZ' |
| 171 | CREATE FUNCTION get_quality (content TEXT, regex TEXT) |
| 172 | RETURNS FLOAT DETERMINISTIC |
| 173 | RETURN (LENGTH(content) - LENGTH(REGEXP_REPLACE(content, regex, ''))) / SQRT(LENGTH(content)) |
| 174 | ZZZZZZZZZZ); |
| 175 | $sql = <<<ZZZZZZZZZZ |
| 176 | WITH |
| 177 | {$page_withs_sql}, |
| 178 | merged AS ({$merged_sql}), |
| 179 | concatted AS ( |
| 180 | SELECT |
| 181 | *, |
| 182 | CONCAT(IFNULL(title, ''), ' ', IFNULL(title, ''), ' ', IFNULL(title, ''), ' ', IFNULL(text, '')) AS concatted_content, |
| 183 | LEAST(1.0, GREATEST(0.0, time_relevance)) AS norm_time_relevance |
| 184 | FROM merged |
| 185 | ), |
| 186 | evaluated AS ( |
| 187 | SELECT |
| 188 | *, |
| 189 | {$term_evaluation_sql} |
| 190 | FROM concatted |
| 191 | ), |
| 192 | scored AS ( |
| 193 | SELECT |
| 194 | *, |
| 195 | ( |
| 196 | (1 - 1 / (1 + ({$term_quality_sql}))) * norm_time_relevance |
| 197 | ) AS score |
| 198 | FROM evaluated |
| 199 | ) |
| 200 | SELECT * |
| 201 | FROM scored |
| 202 | WHERE score > 0 |
| 203 | ORDER BY score DESC |
| 204 | ZZZZZZZZZZ; |
| 205 | $this->log()->debug("Search SQL: {$sql}"); |
| 206 | $sql_results = $this->entityManager()->createNativeQuery($sql, $rsm)->getArrayResult(); |
| 207 | return array_map(function ($row) use ($terms) { |
| 208 | return [ |
| 209 | 'link' => $row['link'], |
| 210 | 'icon' => $row['icon'], |
| 211 | 'date' => $row['date'], |
| 212 | 'title' => $row['title'], |
| 213 | 'text' => $this->searchUtils()->getCutout(strip_tags($row['text'] ?? ''), $terms) ?: null, |
| 214 | 'score' => round($row['score'], 5), |
| 215 | 'debug' => implode(' / ', [ |
| 216 | 'Score: '.round($row['score'], 5), |
| 217 | 'Time relevance: '.round($row['time_relevance'], 5), |
| 218 | ]), |
| 219 | ]; |
| 220 | }, $sql_results); |
| 221 | } |
| 222 | |
| 223 | protected function sqlExecute(string $sql): void { |
| 224 | $this->entityManager()->createNativeQuery($sql, new ResultSetMapping())->execute(); |
| 225 | } |
| 226 | |
| 227 | /** @return array<string> */ |
| 228 | public function getDateFormattings(?\DateTime $date): array { |
| 229 | if ($date === null) { |
| 230 | return []; |
| 231 | } |
| 232 | return [ |
| 233 | $date->format('Y-m-d'), |
| 234 | $date->format('d.m.Y'), |
| 235 | $date->format('j.n.Y'), |
| 236 | ]; |
| 237 | } |
| 238 | |
| 239 | public function getDateSql(string $field, string $term): ?string { |
| 240 | $result = $this->dateUtils()->parseDateTimeRange($term); |
| 241 | if ($result === null) { |
| 242 | return null; |
| 243 | } |
| 244 | return <<<ZZZZZZZZZZ |
| 245 | ( |
| 246 | {$field} >= '{$result['start']->format('Y-m-d')}' |
| 247 | AND {$field} < '{$result['end']->format('Y-m-d')}' |
| 248 | ) |
| 249 | ZZZZZZZZZZ; |
| 250 | } |
| 251 | |
| 252 | /** @param array<string> $search_terms */ |
| 253 | public function getCutout(string $text, array $search_terms, int $size = 100): string { |
| 254 | $text = $this->censorEmails($text); |
| 255 | $offsets_by_term = $this->getOffsets($text, $search_terms); |
| 256 | |
| 257 | $text_length = mb_strlen($text); |
| 258 | $term_lengths = []; |
| 259 | $term_scores = []; |
| 260 | foreach ($search_terms as $search_term) { |
| 261 | $term_length = mb_strlen($search_term); |
| 262 | $term_lengths[] = $term_length; |
| 263 | $term_scores[] = log($term_length) + 1; |
| 264 | } |
| 265 | |
| 266 | $all_end_offsets = []; |
| 267 | for ($i = 0; $i < count($offsets_by_term); $i++) { |
| 268 | $term_length = $term_lengths[$i]; |
| 269 | foreach ($offsets_by_term[$i] as $offset) { |
| 270 | $all_end_offsets[] = $offset + $term_length; |
| 271 | } |
| 272 | } |
| 273 | $all_end_offsets[] = $text_length; |
| 274 | sort($all_end_offsets); |
| 275 | |
| 276 | $best_cutout_start = 0; |
| 277 | $best_cutout_end = 0; |
| 278 | $best_cutout_score = 0; |
| 279 | $start_idxs = array_map(fn () => -1, $search_terms); |
| 280 | $end_idxs = array_map(fn () => -1, $search_terms); |
| 281 | $after_all = $text_length + 1; |
| 282 | foreach ($all_end_offsets as $offset) { |
| 283 | $start = $offset; |
| 284 | $score = 1; |
| 285 | for ($i = 0; $i < count($search_terms); $i++) { |
| 286 | $term_length = $term_lengths[$i]; |
| 287 | $offsets = $offsets_by_term[$i]; |
| 288 | while (($offsets[$end_idxs[$i] + 1] ?? $after_all) + $term_length <= $offset) { |
| 289 | $end_idxs[$i]++; |
| 290 | } |
| 291 | while (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $offset - $size) { |
| 292 | $start_idxs[$i]++; |
| 293 | } |
| 294 | $num = $end_idxs[$i] - $start_idxs[$i]; |
| 295 | $term_score = $term_scores[$i]; |
| 296 | $score *= ($num * $term_score) + 1; |
| 297 | if (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $start) { |
| 298 | $start = $offsets[$start_idxs[$i] + 1]; |
| 299 | } |
| 300 | } |
| 301 | if ($score > $best_cutout_score) { |
| 302 | $best_cutout_score = $score; |
| 303 | $best_cutout_start = $start; |
| 304 | $best_cutout_end = $offset; |
| 305 | } |
| 306 | } |
| 307 | $best_cutout_length = $best_cutout_end - $best_cutout_start; |
| 308 | $margin_size = ($size - $best_cutout_length) / 2; |
| 309 | $offset = max(0, min($text_length - $size, $best_cutout_start - intval($margin_size))); |
| 310 | return implode('', [ |
| 311 | $offset === 0 ? '' : '…', |
| 312 | trim(mb_substr($text, $offset, $size)), |
| 313 | ($offset + $size >= $text_length) ? '' : '…', |
| 314 | ]); |
| 315 | } |
| 316 | |
| 317 | public function censorEmails(string $text): string { |
| 318 | return preg_replace('/([A-Z0-9a-z._%+-]+)@([A-Za-z0-9.-]+)/', '***@***', $text) ?? ''; |
| 319 | } |
| 320 | |
| 321 | /** |
| 322 | * @param array<string> $search_terms |
| 323 | * |
| 324 | * @return array<array<int>> |
| 325 | */ |
| 326 | public function getOffsets(string $text, array $search_terms): array { |
| 327 | $text_length = mb_strlen($text); |
| 328 | $offsets_by_term = []; |
| 329 | foreach ($search_terms as $search_term) { |
| 330 | $term_length = mb_strlen($search_term); |
| 331 | $term_regex = preg_quote($search_term, '/'); |
| 332 | $parts = preg_split("/({$term_regex})/ui", $text) ?: []; |
| 333 | $part_lengths = array_map(fn ($part) => mb_strlen($part), $parts); |
| 334 | unset($parts); |
| 335 | $offsets = []; |
| 336 | $offset = 0; |
| 337 | $sanity_check = false; |
| 338 | foreach ($part_lengths as $part_length) { |
| 339 | $offset += $part_length; |
| 340 | $sanity_check = $offset === $text_length; |
| 341 | if (!$sanity_check) { // Don't add the last offset; it's just the text length |
| 342 | $offsets[] = $offset; |
| 343 | } |
| 344 | $offset += $term_length; |
| 345 | } |
| 346 | assert($sanity_check, 'Cutout offset sanity check failed'); |
| 347 | $offsets_by_term[] = $offsets; |
| 348 | } |
| 349 | return $offsets_by_term; |
| 350 | } |
| 351 | |
| 352 | /** @param array<string> $search_terms */ |
| 353 | public function highlight(string $text, array $search_terms): string { |
| 354 | $offsets_by_term = $this->getOffsets($text, $search_terms); |
| 355 | $term_lengths = array_map(fn ($term) => mb_strlen($term), $search_terms); |
| 356 | |
| 357 | $ranges = []; |
| 358 | for ($i = 0; $i < count($offsets_by_term); $i++) { |
| 359 | $term_length = $term_lengths[$i]; |
| 360 | foreach ($offsets_by_term[$i] as $offset) { |
| 361 | $ranges[] = [$offset, $offset + $term_length]; |
| 362 | } |
| 363 | } |
| 364 | $merged_ranges = $this->normalizeRanges($ranges); |
| 365 | |
| 366 | $out = ''; |
| 367 | $start_tag = '<span class="highlight">'; |
| 368 | $end_tag = '</span>'; |
| 369 | $last_end = 0; |
| 370 | foreach ($merged_ranges as $range) { |
| 371 | $out .= mb_substr($text, $last_end, $range[0] - $last_end); |
| 372 | $out .= $start_tag; |
| 373 | $out .= mb_substr($text, $range[0], $range[1] - $range[0]); |
| 374 | $out .= $end_tag; |
| 375 | $last_end = $range[1]; |
| 376 | } |
| 377 | $out .= mb_substr($text, $last_end); |
| 378 | return $out; |
| 379 | } |
| 380 | |
| 381 | /** |
| 382 | * @param array<array{0:int, 1:int}> $ranges |
| 383 | * |
| 384 | * @return array<array{0:int, 1:int}> |
| 385 | */ |
| 386 | public function normalizeRanges(array $ranges): array { |
| 387 | usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]); |
| 388 | $normalized_ranges = []; |
| 389 | $num_ranges = count($ranges); |
| 390 | $i = 0; |
| 391 | while ($i < $num_ranges) { |
| 392 | $range = $ranges[$i]; |
| 393 | $start = $range[0]; |
| 394 | $end = $range[1]; |
| 395 | while ($i + 1 < $num_ranges && $ranges[$i + 1][0] <= $end) { // merge next range |
| 396 | $end = max($end, $ranges[$i + 1][1]); |
| 397 | $i++; |
| 398 | } |
| 399 | $normalized_ranges[] = [$start, $end]; |
| 400 | $i++; |
| 401 | } |
| 402 | return $normalized_ranges; |
| 403 | } |
| 404 | } |