Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 169 |
|
0.00% |
0 / 13 |
CRAP | |
0.00% |
0 / 1 |
| SearchUtils | |
0.00% |
0 / 169 |
|
0.00% |
0 / 13 |
1892 | |
0.00% |
0 / 1 |
| getSearchResults | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
| getPageSearchResults | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
| getDateCriteria | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
| getStaticSearchResults | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
| getScoredSearchResult | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
| getDateFormattings | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
| analyze | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
20 | |||
| getCutout | |
0.00% |
0 / 49 |
|
0.00% |
0 / 1 |
156 | |||
| censorEmails | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| getOffsets | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
30 | |||
| highlight | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
20 | |||
| normalizeRanges | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
| fromEnv | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Olz\Suche\Utils; |
| 4 | |
| 5 | use Doctrine\Common\Collections\Criteria; |
| 6 | use Doctrine\Common\Collections\Expr\Expression; |
| 7 | use Olz\Apps\Anmelden\Components\OlzAnmelden\OlzAnmelden; |
| 8 | use Olz\Apps\Commands\Components\OlzCommands\OlzCommands; |
| 9 | use Olz\Apps\Files\Components\OlzFiles\OlzFiles; |
| 10 | use Olz\Apps\Logs\Components\OlzLogs\OlzLogs; |
| 11 | use Olz\Apps\Members\Components\OlzMembers\OlzMembers; |
| 12 | use Olz\Apps\Monitoring\Components\OlzMonitoring\OlzMonitoring; |
| 13 | use Olz\Apps\Newsletter\Components\OlzNewsletter\OlzNewsletter; |
| 14 | use Olz\Apps\Oev\Components\OlzOev\OlzOev; |
| 15 | use Olz\Apps\Panini2024\Components\OlzPanini2024\OlzPanini2024; |
| 16 | use Olz\Apps\Panini2024\Components\OlzPanini2024All\OlzPanini2024All; |
| 17 | use Olz\Apps\Panini2024\Components\OlzPanini2024Masks\OlzPanini2024Masks; |
| 18 | use Olz\Apps\Quiz\Components\OlzQuiz\OlzQuiz; |
| 19 | use Olz\Apps\Results\Components\OlzResults\OlzResults; |
| 20 | use Olz\Apps\SearchEngines\Components\OlzSearchEngines\OlzSearchEngines; |
| 21 | use Olz\Apps\Statistics\Components\OlzStatistics\OlzStatistics; |
| 22 | use Olz\Apps\Youtube\Components\OlzYoutube\OlzYoutube; |
| 23 | use Olz\Components\Auth\OlzEmailReaktion\OlzEmailReaktion; |
| 24 | use Olz\Components\Common\OlzRootComponent; |
| 25 | use Olz\Components\OlzHtmlSitemap\OlzHtmlSitemap; |
| 26 | use Olz\Components\OtherPages\OlzDatenschutz\OlzDatenschutz; |
| 27 | use Olz\Components\OtherPages\OlzFuerEinsteiger\OlzFuerEinsteiger; |
| 28 | use Olz\Components\OtherPages\OlzMaterial\OlzMaterial; |
| 29 | use Olz\Faq\Components\OlzFaqDetail\OlzFaqDetail; |
| 30 | use Olz\Faq\Components\OlzFaqList\OlzFaqList; |
| 31 | use Olz\Karten\Components\OlzKarteDetail\OlzKarteDetail; |
| 32 | use Olz\Karten\Components\OlzKarten\OlzKarten; |
| 33 | use Olz\News\Components\OlzNewsDetail\OlzNewsDetail; |
| 34 | use Olz\News\Components\OlzNewsList\OlzNewsList; |
| 35 | use Olz\Roles\Components\OlzRolePage\OlzRolePage; |
| 36 | use Olz\Roles\Components\OlzVerein\OlzVerein; |
| 37 | use Olz\Service\Components\OlzService\OlzService; |
| 38 | use Olz\Startseite\Components\OlzStartseite\OlzStartseite; |
| 39 | use Olz\Suche\Components\OlzSuche\OlzSuche; |
| 40 | use Olz\Termine\Components\OlzTerminDetail\OlzTerminDetail; |
| 41 | use Olz\Termine\Components\OlzTermineList\OlzTermineList; |
| 42 | use Olz\Termine\Components\OlzTerminLocationDetail\OlzTerminLocationDetail; |
| 43 | use Olz\Termine\Components\OlzTerminLocationsList\OlzTerminLocationsList; |
| 44 | use Olz\Termine\Components\OlzTerminTemplateDetail\OlzTerminTemplateDetail; |
| 45 | use Olz\Termine\Components\OlzTerminTemplatesList\OlzTerminTemplatesList; |
| 46 | use Olz\Users\Components\OlzUserDetail\OlzUserDetail; |
| 47 | use Olz\Utils\WithUtilsTrait; |
| 48 | |
| 49 | /** |
| 50 | * @phpstan-type SearchResult array{ |
| 51 | * score: float, |
| 52 | * link: non-empty-string, |
| 53 | * icon: ?non-empty-string, |
| 54 | * date: ?\DateTime, |
| 55 | * title: non-empty-string, |
| 56 | * text: ?non-empty-string, |
| 57 | * } |
| 58 | * @phpstan-type PageSearchResults array{ |
| 59 | * title: non-empty-string, |
| 60 | * bestScore: ?float, |
| 61 | * results: array<SearchResult>, |
| 62 | * } |
| 63 | */ |
| 64 | class SearchUtils { |
| 65 | use WithUtilsTrait; |
| 66 | |
| 67 | /** @var array<class-string<OlzRootComponent<mixed>>> */ |
| 68 | protected static array $all_page_classes = [ |
| 69 | // All classes that extend `OlzRootComponent` should be listed here: |
| 70 | OlzAnmelden::class, |
| 71 | OlzCommands::class, |
| 72 | OlzFiles::class, |
| 73 | OlzLogs::class, |
| 74 | OlzMembers::class, |
| 75 | OlzMonitoring::class, |
| 76 | OlzNewsletter::class, |
| 77 | OlzOev::class, |
| 78 | OlzPanini2024::class, |
| 79 | OlzPanini2024All::class, |
| 80 | OlzPanini2024Masks::class, |
| 81 | OlzQuiz::class, |
| 82 | OlzResults::class, |
| 83 | OlzSearchEngines::class, |
| 84 | OlzStatistics::class, |
| 85 | OlzYoutube::class, |
| 86 | OlzEmailReaktion::class, |
| 87 | OlzHtmlSitemap::class, |
| 88 | OlzDatenschutz::class, |
| 89 | OlzFuerEinsteiger::class, |
| 90 | OlzMaterial::class, |
| 91 | OlzFaqDetail::class, |
| 92 | OlzFaqList::class, |
| 93 | OlzKarteDetail::class, |
| 94 | OlzKarten::class, |
| 95 | OlzNewsDetail::class, |
| 96 | OlzNewsList::class, |
| 97 | OlzRolePage::class, |
| 98 | OlzVerein::class, |
| 99 | OlzService::class, |
| 100 | OlzStartseite::class, |
| 101 | OlzSuche::class, |
| 102 | OlzTerminDetail::class, |
| 103 | OlzTermineList::class, |
| 104 | OlzTerminLocationDetail::class, |
| 105 | OlzTerminLocationsList::class, |
| 106 | OlzTerminTemplateDetail::class, |
| 107 | OlzTerminTemplatesList::class, |
| 108 | OlzUserDetail::class, |
| 109 | ]; |
| 110 | |
/**
 * Runs the search on every registered page class and ranks the pages.
 *
 * The returned pages are ordered by their best result score, highest first.
 *
 * @param array<string> $terms
 *
 * @return array<PageSearchResults>
 */
public function getSearchResults(array $terms): array {
    $pages = array_map(
        fn ($page_class) => $this->getPageSearchResults($page_class, $terms),
        self::$all_page_classes,
    );
    usort($pages, function ($left, $right) {
        return $right['bestScore'] <=> $left['bestScore'];
    });
    return $pages;
}
| 124 | |
/**
 * Collects the search results of a single page, best score first.
 *
 * `bestScore` is the score of the top result, or null if the page has no results.
 *
 * @param class-string<OlzRootComponent<array<string, mixed>>> $page_class
 * @param array<string>                                        $terms
 *
 * @return PageSearchResults
 */
protected function getPageSearchResults(string $page_class, array $terms): array {
    $component = new $page_class();
    $hits = $component->getSearchResults($terms);
    usort($hits, function ($left, $right) {
        return $right['score'] <=> $left['score'];
    });
    $top_hit = $hits[0] ?? null;
    return [
        'title' => $component->getSearchTitle(),
        'bestScore' => $top_hit['score'] ?? null,
        'results' => $hits,
    ];
}
| 143 | |
/**
 * Builds the criteria that restrict `$field` to the date range described by `$term`.
 *
 * The range is half-open: `$field >= start` and `$field < end`. No criteria are
 * returned when the term cannot be parsed as a date/time range.
 *
 * @return array<Expression>
 */
public function getDateCriteria(string $field, string $term): array {
    $range = $this->dateUtils()->parseDateTimeRange($term);
    if ($range !== null) {
        $not_before_start = Criteria::expr()->gte($field, $range['start']);
        $before_end = Criteria::expr()->lt($field, $range['end']);
        return [Criteria::expr()->andX($not_before_start, $before_end)];
    }
    return [];
}
| 155 | |
/**
 * Produces the search result for a page with static content.
 *
 * The content and the title are searched together. The page is only reported
 * (as a single result) if every term was found; otherwise the list is empty.
 * Values in `$defaults` take precedence over the computed fields.
 *
 * @param array<string> $terms
 * @param array{
 *   link: non-empty-string,
 *   icon?: ?non-empty-string,
 *   date?: ?\DateTime,
 *   title: non-empty-string,
 * } $defaults
 *
 * @return array<SearchResult>
 */
public function getStaticSearchResults(
    string $content,
    array $terms,
    array $defaults,
): array {
    $haystack = "{$content} {$defaults['title']}";
    $date = $defaults['date'] ?? null;
    $analysis = $this->analyze($haystack, $date, $terms);
    if ($analysis['hasAll']) {
        $cutout = $this->searchUtils()->getCutout($content, $terms);
        $computed = [
            'score' => $analysis['score'],
            'icon' => null,
            'date' => null,
            'text' => $cutout ?: null,
        ];
        return [[...$computed, ...$defaults]];
    }
    return [];
}
| 187 | |
/**
 * Completes a partial search result with its score and a text excerpt.
 *
 * The score is computed over the text and the title (plus the date, if any);
 * the text is replaced by a cutout around the matches, or null if that is empty.
 * Missing `icon` and `date` entries default to null.
 *
 * @param array{
 *   link: non-empty-string,
 *   icon?: ?non-empty-string,
 *   date?: ?\DateTime,
 *   title: non-empty-string,
 *   text?: ?non-empty-string,
 * } $result
 * @param array<string> $terms
 *
 * @return SearchResult
 */
public function getScoredSearchResult(
    array $result,
    array $terms,
): array {
    $body = $result['text'] ?? '';
    $haystack = "{$body} {$result['title']}";
    $analysis = $this->analyze($haystack, $result['date'] ?? null, $terms);

    $scored = ['icon' => null, 'date' => null, ...$result];
    $scored['score'] = $analysis['score'];
    $scored['text'] = $this->searchUtils()->getCutout($body, $terms) ?: null;
    return $scored;
}
| 215 | |
/**
 * Lists the textual forms under which a date can be searched for.
 *
 * In order: ISO (`Y-m-d`), zero-padded dotted (`d.m.Y`) and unpadded dotted (`j.n.Y`).
 *
 * @return array<string> empty if no date is given
 */
public function getDateFormattings(?\DateTime $date): array {
    if ($date === null) {
        return [];
    }
    $patterns = ['Y-m-d', 'd.m.Y', 'j.n.Y'];
    return array_map(fn (string $pattern) => $date->format($pattern), $patterns);
}
| 227 | |
/**
 * Scores how well `$content` (together with `$date`) matches the search terms.
 *
 * Every term is matched literally, case-insensitively and Unicode-aware. A term
 * counts as found if it occurs in the content or in one of the date formattings
 * (a date hit counts as one additional occurrence). The score is
 * `1 - 1 / (average occurrences per term + 1)`, rounded to 5 decimals: 0 without
 * any hit, approaching 1 as the number of hits grows.
 *
 * @param array<string> $terms
 *
 * @return array{score: float, hasAll: bool} `hasAll` tells whether every term was found
 */
public function analyze(string $content, ?\DateTime $date, array $terms): array {
    if ($terms === []) {
        // Nothing to look for; also avoids dividing by `count($terms) === 0` below.
        return ['score' => 0.0, 'hasAll' => true];
    }
    $date_formattings = implode(' ', $this->getDateFormattings($date));
    $has_all = true;
    $sum_occurrences = 0;
    foreach ($terms as $term) {
        // Quote the delimiter as well, so that terms containing "/" stay valid patterns.
        // The `u` modifier makes the case-insensitive match work for non-ASCII letters.
        $pattern = '/'.preg_quote($term, '/').'/ui';
        // `preg_match_all` returns false on failure; count that as "no occurrence".
        $num_occurrences = (int) preg_match_all($pattern, $content);
        if (preg_match($pattern, $date_formattings) === 1) {
            $num_occurrences++;
        }
        $sum_occurrences += $num_occurrences;
        if ($num_occurrences === 0) {
            $has_all = false;
        }
    }
    $score = round(1 - (1 / ($sum_occurrences / count($terms) + 1)), 5);
    return ['score' => $score, 'hasAll' => $has_all];
}
| 251 | |
/**
 * Extracts an excerpt of at most `$size` characters from `$text`, placed around
 * the densest cluster of search term matches.
 *
 * E-mail addresses are censored before anything else. Every match end (and the
 * end of the text) is tried as the end of a window of `$size` characters; a
 * window scores higher the more matches it fully contains, with longer terms
 * weighing more. The excerpt is centered on the matches of the best window and
 * gets an ellipsis on each side where text was cut off.
 *
 * Note: when no term matches at all, the only candidate is the window ending at
 * the end of the text, so the excerpt is the tail of the text.
 *
 * @param array<string> $search_terms empty terms are ignored
 */
public function getCutout(string $text, array $search_terms, int $size = 100): string {
    // Empty terms carry no information, and `log(0) = -INF` would poison every
    // window score. Re-index so that all per-term arrays below line up, whatever
    // keys the caller's array had.
    $search_terms = array_values(array_filter(
        $search_terms,
        fn ($search_term) => $search_term !== '',
    ));
    $num_terms = count($search_terms);

    $text = $this->censorEmails($text);
    $offsets_by_term = $this->getOffsets($text, $search_terms);

    $text_length = mb_strlen($text);
    $term_lengths = [];
    $term_scores = [];
    foreach ($search_terms as $search_term) {
        $term_length = mb_strlen($search_term);
        $term_lengths[] = $term_length;
        // Longer terms get a higher weight per match.
        $term_scores[] = log($term_length) + 1;
    }

    // Candidate window ends: the end of every match, plus the end of the text.
    $all_end_offsets = [];
    for ($i = 0; $i < $num_terms; $i++) {
        $term_length = $term_lengths[$i];
        foreach ($offsets_by_term[$i] as $offset) {
            $all_end_offsets[] = $offset + $term_length;
        }
    }
    $all_end_offsets[] = $text_length;
    sort($all_end_offsets);

    $best_cutout_start = 0;
    $best_cutout_end = 0;
    $best_cutout_score = 0;
    // Per term: index of the last match starting before the window, and of the
    // last match ending inside it. Both only ever move forward.
    $start_idxs = array_fill(0, $num_terms, -1);
    $end_idxs = array_fill(0, $num_terms, -1);
    // Sentinel offset that lies beyond every possible window.
    $after_all = $text_length + 1;
    foreach ($all_end_offsets as $offset) {
        $start = $offset;
        $score = 1;
        for ($i = 0; $i < $num_terms; $i++) {
            $term_length = $term_lengths[$i];
            $offsets = $offsets_by_term[$i];
            // Include all matches that end at or before the window end.
            while (($offsets[$end_idxs[$i] + 1] ?? $after_all) + $term_length <= $offset) {
                $end_idxs[$i]++;
            }
            // Skip all matches that start before the window start.
            while (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $offset - $size) {
                $start_idxs[$i]++;
            }
            // Number of matches of this term that lie fully inside the window.
            $num = $end_idxs[$i] - $start_idxs[$i];
            $term_score = $term_scores[$i];
            $score *= ($num * $term_score) + 1;
            // The cluster starts at the earliest match inside the window.
            if (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $start) {
                $start = $offsets[$start_idxs[$i] + 1];
            }
        }
        if ($score > $best_cutout_score) {
            $best_cutout_score = $score;
            $best_cutout_start = $start;
            $best_cutout_end = $offset;
        }
    }
    // Center the cluster within the excerpt, clamped to the text boundaries.
    $best_cutout_length = $best_cutout_end - $best_cutout_start;
    $margin_size = ($size - $best_cutout_length) / 2;
    $offset = max(0, min($text_length - $size, $best_cutout_start - intval($margin_size)));
    return implode('', [
        $offset === 0 ? '' : '…',
        trim(mb_substr($text, $offset, $size)),
        ($offset + $size >= $text_length) ? '' : '…',
    ]);
}
| 316 | |
/**
 * Replaces everything that looks like an e-mail address by `***@***`.
 *
 * Returns an empty string if the replacement fails.
 */
public function censorEmails(string $text): string {
    $email_pattern = '/([A-Z0-9a-z._%+-]+)@([A-Za-z0-9.-]+)/';
    $censored = preg_replace($email_pattern, '***@***', $text);
    if ($censored === null) {
        return '';
    }
    return $censored;
}
| 320 | |
/**
 * Finds, for every search term, the character offsets of its matches in `$text`.
 *
 * Matching is literal, case-insensitive and Unicode-aware. The text is split at
 * the matches; summing up the lengths of the pieces and of the term in between
 * yields the offset of each match.
 *
 * @param array<string> $search_terms
 *
 * @return array<array<int>> one list of offsets per term, in the order of the terms
 */
public function getOffsets(string $text, array $search_terms): array {
    $total_length = mb_strlen($text);
    $result = [];
    foreach ($search_terms as $search_term) {
        $quoted_term = preg_quote($search_term, '/');
        $segments = preg_split("/({$quoted_term})/ui", $text) ?: [];
        $step = mb_strlen($search_term);

        $positions = [];
        $cursor = 0;
        $reached_end = false;
        foreach ($segments as $segment) {
            $cursor += mb_strlen($segment);
            $reached_end = ($cursor === $total_length);
            // The end of the last segment is the end of the text, not a match.
            if (!$reached_end) {
                $positions[] = $cursor;
            }
            $cursor += $step;
        }
        // After the last segment, we must have arrived exactly at the text end.
        assert($reached_end, 'Cutout offset sanity check failed');
        $result[] = $positions;
    }
    return $result;
}
| 351 | |
/**
 * Wraps every occurrence of a search term in `<span class="highlight">…</span>`.
 *
 * Overlapping or adjacent occurrences are wrapped by one common span.
 * The text itself is copied as is (no escaping is applied here).
 *
 * @param array<string> $search_terms
 */
public function highlight(string $text, array $search_terms): string {
    $term_lengths = array_map(fn ($term) => mb_strlen($term), $search_terms);

    // One [start, end) range per occurrence of any term.
    $spans = [];
    foreach ($this->getOffsets($text, $search_terms) as $index => $offsets) {
        $length = $term_lengths[$index];
        foreach ($offsets as $from) {
            $spans[] = [$from, $from + $length];
        }
    }

    // Alternate between untouched text and highlighted ranges.
    $pieces = [];
    $cursor = 0;
    foreach ($this->normalizeRanges($spans) as $span) {
        $pieces[] = mb_substr($text, $cursor, $span[0] - $cursor);
        $pieces[] = '<span class="highlight">';
        $pieces[] = mb_substr($text, $span[0], $span[1] - $span[0]);
        $pieces[] = '</span>';
        $cursor = $span[1];
    }
    $pieces[] = mb_substr($text, $cursor);
    return implode('', $pieces);
}
| 380 | |
/**
 * Sorts the ranges by start and merges those that overlap or touch.
 *
 * @param array<array{0:int, 1:int}> $ranges
 *
 * @return array<array{0:int, 1:int}> disjoint ranges in ascending order
 */
public function normalizeRanges(array $ranges): array {
    usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]);
    $merged = [];
    foreach ($ranges as $range) {
        $last = count($merged) - 1;
        if ($last >= 0 && $range[0] <= $merged[$last][1]) {
            // Starts within (or right at the end of) the previous range: extend it.
            $merged[$last][1] = max($merged[$last][1], $range[1]);
            continue;
        }
        $merged[] = [$range[0], $range[1]];
    }
    return $merged;
}
| 404 | |
/** Creates a new instance; the constructor is called without arguments. */
public static function fromEnv(): self {
    $instance = new self();
    return $instance;
}
| 408 | } |