Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
69.80% |
171 / 245 |
|
83.33% |
10 / 12 |
CRAP | |
0.00% |
0 / 1 |
| SearchUtils | |
69.80% |
171 / 245 |
|
83.33% |
10 / 12 |
115.16 | |
0.00% |
0 / 1 |
| getSearchResults | |
0.00% |
0 / 73 |
|
0.00% |
0 / 1 |
90 | |||
| sqlExecute | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| getDateFormattings | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
| getDateSql | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
| getSnippetsWhereSql | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
| getStaticResultQuery | |
100.00% |
30 / 30 |
|
100.00% |
1 / 1 |
2 | |||
| unionAllQueries | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
5 | |||
| getCutout | |
100.00% |
49 / 49 |
|
100.00% |
1 / 1 |
12 | |||
| censorEmails | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getOffsets | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
5 | |||
| highlight | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
4 | |||
| normalizeRanges | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Olz\Suche\Utils; |
| 4 | |
| 5 | use Doctrine\ORM\Query\ResultSetMapping; |
| 6 | use Olz\Anniversary\Components\OlzAnniversary\OlzAnniversary; |
| 7 | use Olz\Apps\Anmelden\Components\OlzAnmelden\OlzAnmelden; |
| 8 | use Olz\Apps\Commands\Components\OlzCommands\OlzCommands; |
| 9 | use Olz\Apps\Files\Components\OlzFiles\OlzFiles; |
| 10 | use Olz\Apps\Logs\Components\OlzLogs\OlzLogs; |
| 11 | use Olz\Apps\Members\Components\OlzMembers\OlzMembers; |
| 12 | use Olz\Apps\Monitoring\Components\OlzMonitoring\OlzMonitoring; |
| 13 | use Olz\Apps\Newsletter\Components\OlzNewsletter\OlzNewsletter; |
| 14 | use Olz\Apps\Oev\Components\OlzOev\OlzOev; |
| 15 | use Olz\Apps\Panini2024\Components\OlzPanini2024\OlzPanini2024; |
| 16 | use Olz\Apps\Panini2024\Components\OlzPanini2024All\OlzPanini2024All; |
| 17 | use Olz\Apps\Panini2024\Components\OlzPanini2024Masks\OlzPanini2024Masks; |
| 18 | use Olz\Apps\Quiz\Components\OlzQuiz\OlzQuiz; |
| 19 | use Olz\Apps\Results\Components\OlzResults\OlzResults; |
| 20 | use Olz\Apps\SearchEngines\Components\OlzSearchEngines\OlzSearchEngines; |
| 21 | use Olz\Apps\Statistics\Components\OlzStatistics\OlzStatistics; |
| 22 | use Olz\Apps\Youtube\Components\OlzYoutube\OlzYoutube; |
| 23 | use Olz\Components\Auth\OlzEmailReaktion\OlzEmailReaktion; |
| 24 | use Olz\Components\Common\OlzRootComponent; |
| 25 | use Olz\Components\OlzHtmlSitemap\OlzHtmlSitemap; |
| 26 | use Olz\Components\OtherPages\OlzAngebot\OlzAngebot; |
| 27 | use Olz\Components\OtherPages\OlzDatenschutz\OlzDatenschutz; |
| 28 | use Olz\Components\OtherPages\OlzFuerEinsteiger\OlzFuerEinsteiger; |
| 29 | use Olz\Faq\Components\OlzFaqDetail\OlzFaqDetail; |
| 30 | use Olz\Faq\Components\OlzFaqList\OlzFaqList; |
| 31 | use Olz\Karten\Components\OlzKarteDetail\OlzKarteDetail; |
| 32 | use Olz\Karten\Components\OlzKarten\OlzKarten; |
| 33 | use Olz\News\Components\OlzNewsDetail\OlzNewsDetail; |
| 34 | use Olz\News\Components\OlzNewsList\OlzNewsList; |
| 35 | use Olz\Repository\Snippets\PredefinedSnippet; |
| 36 | use Olz\Roles\Components\OlzRoleDetail\OlzRoleDetail; |
| 37 | use Olz\Roles\Components\OlzVerein\OlzVerein; |
| 38 | use Olz\Service\Components\OlzService\OlzService; |
| 39 | use Olz\Startseite\Components\OlzStartseite\OlzStartseite; |
| 40 | use Olz\Suche\Components\OlzSuche\OlzSuche; |
| 41 | use Olz\Termine\Components\OlzTerminDetail\OlzTerminDetail; |
| 42 | use Olz\Termine\Components\OlzTermineList\OlzTermineList; |
| 43 | use Olz\Termine\Components\OlzTerminLocationDetail\OlzTerminLocationDetail; |
| 44 | use Olz\Termine\Components\OlzTerminLocationsList\OlzTerminLocationsList; |
| 45 | use Olz\Termine\Components\OlzTerminTemplateDetail\OlzTerminTemplateDetail; |
| 46 | use Olz\Termine\Components\OlzTerminTemplatesList\OlzTerminTemplatesList; |
| 47 | use Olz\Users\Components\OlzUserDetail\OlzUserDetail; |
| 48 | use Olz\Utils\WithUtilsTrait; |
| 49 | |
| 50 | /** |
| 51 | * @phpstan-type SearchResult array{ |
| 52 | * link: non-empty-string, |
| 53 | * icon: ?non-empty-string, |
| 54 | * date: ?\DateTime, |
| 55 | * title: non-empty-string, |
| 56 | * text: ?non-empty-string, |
| 57 | * score: float, |
| 58 | * debug: string, |
| 59 | * } |
| 60 | * @phpstan-type StaticResult array{ |
| 61 | * link: string, |
| 62 | * icon?: ?string, |
| 63 | * date?: ?\DateTime, |
| 64 | * title: string, |
| 65 | * text?: ?string, |
| 66 | * timeRelevance?: ?float, |
| 67 | * } |
| 68 | * @phpstan-type WithQuery array{ |
| 69 | * with: array<string>, |
| 70 | * query: string, |
| 71 | * } |
| 72 | */ |
| 73 | class SearchUtils { |
| 74 | use WithUtilsTrait; |
| 75 | |
| 76 | /** @var array<class-string<OlzRootComponent<mixed>>> */ |
| 77 | protected static array $all_page_classes = [ |
| 78 | // All classes that extend `OlzRootComponent` should be listed here: |
| 79 | OlzAnmelden::class, |
| 80 | OlzAnniversary::class, |
| 81 | OlzCommands::class, |
| 82 | OlzFiles::class, |
| 83 | OlzLogs::class, |
| 84 | OlzMembers::class, |
| 85 | OlzMonitoring::class, |
| 86 | OlzNewsletter::class, |
| 87 | OlzOev::class, |
| 88 | OlzPanini2024::class, |
| 89 | OlzPanini2024All::class, |
| 90 | OlzPanini2024Masks::class, |
| 91 | OlzQuiz::class, |
| 92 | OlzResults::class, |
| 93 | OlzSearchEngines::class, |
| 94 | OlzStatistics::class, |
| 95 | OlzYoutube::class, |
| 96 | OlzEmailReaktion::class, |
| 97 | OlzHtmlSitemap::class, |
| 98 | OlzDatenschutz::class, |
| 99 | OlzFuerEinsteiger::class, |
| 100 | OlzAngebot::class, |
| 101 | OlzFaqDetail::class, |
| 102 | OlzFaqList::class, |
| 103 | OlzKarteDetail::class, |
| 104 | OlzKarten::class, |
| 105 | OlzNewsDetail::class, |
| 106 | OlzNewsList::class, |
| 107 | OlzRoleDetail::class, |
| 108 | OlzVerein::class, |
| 109 | OlzService::class, |
| 110 | OlzStartseite::class, |
| 111 | OlzSuche::class, |
| 112 | OlzTerminDetail::class, |
| 113 | OlzTermineList::class, |
| 114 | OlzTerminLocationDetail::class, |
| 115 | OlzTerminLocationsList::class, |
| 116 | OlzTerminTemplateDetail::class, |
| 117 | OlzTerminTemplatesList::class, |
| 118 | OlzUserDetail::class, |
| 119 | ]; |
| 120 | |
| 121 | /** |
| 122 | * @param array<string> $terms |
| 123 | * |
| 124 | * @return array<SearchResult> |
| 125 | */ |
| 126 | public function getSearchResults(array $terms): array { |
| 127 | $db = $this->dbUtils()->getDb(); |
| 128 | $num_terms = count($terms); |
| 129 | $esc_terms = array_map(fn ($term) => $db->real_escape_string($term), $terms); |
| 130 | $page_queries = []; |
| 131 | $page_withs = []; |
| 132 | foreach (self::$all_page_classes as $page_class) { |
| 133 | $page = new $page_class(); |
| 134 | $result = $page->searchSql($esc_terms); |
| 135 | if (is_array($result)) { |
| 136 | $page_queries[] = $result['query']; |
| 137 | foreach ($result['with'] as $with) { |
| 138 | $page_withs[] = $with; |
| 139 | } |
| 140 | } elseif (is_string($result)) { |
| 141 | $page_queries[] = $result; |
| 142 | } // else ignore |
| 143 | } |
| 144 | $page_withs_sql = implode(',', $page_withs); |
| 145 | $merged_sql = implode(' UNION ALL ', $page_queries); |
| 146 | |
| 147 | $idx = 0; |
| 148 | $term_evaluation_sqls = []; |
| 149 | $term_evaluation_columns = []; |
| 150 | foreach ($terms as $term) { |
| 151 | $esc_term = $db->real_escape_string(preg_quote($term)); |
| 152 | $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_term}') AS term{$idx}_any"; |
| 153 | $term_evaluation_columns[] = "term{$idx}_any"; |
| 154 | // Add preference for word matches |
| 155 | $term_evaluation_sqls[] = "get_quality(concatted_content, '(?=\\\\W|^){$esc_term}') AS term{$idx}_prefix"; |
| 156 | $term_evaluation_columns[] = "term{$idx}_prefix"; |
| 157 | $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_term}(?=\\\\W|$)') AS term{$idx}_suffix"; |
| 158 | $term_evaluation_columns[] = "term{$idx}_suffix"; |
| 159 | $idx++; |
| 160 | } |
| 161 | // Add preference to term combination matches |
| 162 | for ($num_combined = 2; $num_combined <= min(3, $num_terms); $num_combined++) { |
| 163 | for ($start_combined = 0; $start_combined <= $num_terms - $num_combined; $start_combined++) { |
| 164 | $combined_terms = array_slice($terms, $start_combined, $num_combined); |
| 165 | $esc_combined_terms = $db->real_escape_string(implode('(\W{0,5}|\s*)', array_map( |
| 166 | fn ($term) => preg_quote($term), |
| 167 | $combined_terms |
| 168 | ))); |
| 169 | $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_combined_terms}') AS terms_{$start_combined}_{$num_combined}"; |
| 170 | $term_evaluation_columns[] = "terms_{$start_combined}_{$num_combined}"; |
| 171 | // TODO: Combined date formattings? |
| 172 | } |
| 173 | } |
| 174 | $term_evaluation_sql = implode(',', $term_evaluation_sqls); |
| 175 | $term_quality_sql = implode('+', $term_evaluation_columns); |
| 176 | $rsm = new ResultSetMapping(); |
| 177 | $rsm->addScalarResult('link', 'link', 'string'); |
| 178 | $rsm->addScalarResult('icon', 'icon', 'string'); |
| 179 | $rsm->addScalarResult('date', 'date', 'date'); |
| 180 | $rsm->addScalarResult('title', 'title', 'string'); |
| 181 | $rsm->addScalarResult('text', 'text', 'string'); |
| 182 | $rsm->addScalarResult('score', 'score', 'string'); |
| 183 | $rsm->addScalarResult('time_relevance', 'time_relevance', 'string'); |
| 184 | $this->sqlExecute('DROP FUNCTION IF EXISTS get_quality'); |
| 185 | $this->sqlExecute(<<<'ZZZZZZZZZZ' |
| 186 | CREATE FUNCTION get_quality (content TEXT, regex TEXT) |
| 187 | RETURNS FLOAT DETERMINISTIC |
| 188 | RETURN (LENGTH(content) - LENGTH(REGEXP_REPLACE(content, regex, ''))) / SQRT(LENGTH(content)) |
| 189 | ZZZZZZZZZZ); |
| 190 | $sql = <<<ZZZZZZZZZZ |
| 191 | WITH |
| 192 | {$page_withs_sql}, |
| 193 | merged AS ({$merged_sql}), |
| 194 | concatted AS ( |
| 195 | SELECT |
| 196 | *, |
| 197 | CONCAT(IFNULL(title, ''), ' ', IFNULL(title, ''), ' ', IFNULL(title, ''), ' ', IFNULL(text, '')) AS concatted_content, |
| 198 | LEAST(1.0, GREATEST(0.0, time_relevance)) AS norm_time_relevance |
| 199 | FROM merged |
| 200 | ), |
| 201 | evaluated AS ( |
| 202 | SELECT |
| 203 | *, |
| 204 | {$term_evaluation_sql} |
| 205 | FROM concatted |
| 206 | ), |
| 207 | scored AS ( |
| 208 | SELECT |
| 209 | *, |
| 210 | ( |
| 211 | (1 - 1 / (1 + ({$term_quality_sql}))) * norm_time_relevance |
| 212 | ) AS score |
| 213 | FROM evaluated |
| 214 | ) |
| 215 | SELECT * |
| 216 | FROM scored |
| 217 | WHERE score > 0 |
| 218 | ORDER BY score DESC |
| 219 | ZZZZZZZZZZ; |
| 220 | $this->log()->debug("Search SQL: {$sql}"); |
| 221 | $sql_results = $this->entityManager()->createNativeQuery($sql, $rsm)->getArrayResult(); |
| 222 | return array_map(function ($row) use ($terms) { |
| 223 | return [ |
| 224 | 'link' => $row['link'], |
| 225 | 'icon' => $row['icon'], |
| 226 | 'date' => $row['date'], |
| 227 | 'title' => $row['title'], |
| 228 | 'text' => $this->searchUtils()->getCutout(strip_tags($row['text'] ?? ''), $terms) ?: null, |
| 229 | 'score' => round($row['score'], 5), |
| 230 | 'debug' => implode(' / ', [ |
| 231 | 'Score: '.round($row['score'], 5), |
| 232 | 'Time relevance: '.round($row['time_relevance'], 5), |
| 233 | ]), |
| 234 | ]; |
| 235 | }, $sql_results); |
| 236 | } |
| 237 | |
| 238 | protected function sqlExecute(string $sql): void { |
| 239 | $this->entityManager()->createNativeQuery($sql, new ResultSetMapping())->execute(); |
| 240 | } |
| 241 | |
| 242 | /** @return array<string> */ |
| 243 | public function getDateFormattings(?\DateTime $date): array { |
| 244 | if ($date === null) { |
| 245 | return []; |
| 246 | } |
| 247 | return [ |
| 248 | $date->format('Y-m-d'), |
| 249 | $date->format('d.m.Y'), |
| 250 | $date->format('j.n.Y'), |
| 251 | ]; |
| 252 | } |
| 253 | |
| 254 | public function getDateSql(string $field, string $term): ?string { |
| 255 | $result = $this->dateUtils()->parseDateTimeRange($term); |
| 256 | if ($result === null) { |
| 257 | return null; |
| 258 | } |
| 259 | return <<<ZZZZZZZZZZ |
| 260 | ( |
| 261 | {$field} >= '{$result['start']->format('Y-m-d')}' |
| 262 | AND {$field} < '{$result['end']->format('Y-m-d')}' |
| 263 | ) |
| 264 | ZZZZZZZZZZ; |
| 265 | } |
| 266 | |
| 267 | /** |
| 268 | * @param array<PredefinedSnippet|int<1,max>> $snippets |
| 269 | * @param array<string> $terms |
| 270 | */ |
| 271 | public function getSnippetsWhereSql(array $snippets, array $terms): ?string { |
| 272 | $terms_where = implode(' AND ', array_map(fn ($term) => "text LIKE '%{$term}%'", $terms)); |
| 273 | $ids = implode(',', array_map( |
| 274 | fn ($snippet) => is_int($snippet) ? "'{$snippet}'" : "'{$snippet->value}'", |
| 275 | $snippets, |
| 276 | )); |
| 277 | return <<<ZZZZZZZZZZ |
| 278 | on_off = '1' |
| 279 | AND {$terms_where} |
| 280 | AND id IN ({$ids}) |
| 281 | ZZZZZZZZZZ; |
| 282 | } |
| 283 | |
| 284 | /** |
| 285 | * @param StaticResult $static_result |
| 286 | * @param array<string> $terms |
| 287 | * |
| 288 | * @return WithQuery |
| 289 | */ |
| 290 | public function getStaticResultQuery(array $static_result, array $terms): array { |
| 291 | $ident = md5(json_encode($static_result) ?: ''); |
| 292 | $esc_link = $this->generalUtils()->internalSqlEscape($static_result['link']); |
| 293 | $esc_icon = $this->generalUtils()->internalNullableSqlEscape($static_result['icon'] ?? null); |
| 294 | // TODO: date |
| 295 | $esc_title = $this->generalUtils()->internalSqlEscape($static_result['title']); |
| 296 | $esc_text = $this->generalUtils()->internalNullableSqlEscape($static_result['text'] ?? null); |
| 297 | $esc_time_relevance = floatval($static_result['timeRelevance'] ?? '1.0'); |
| 298 | $static_where = implode(' AND ', array_map(function ($term) { |
| 299 | return <<<ZZZZZZZZZZ |
| 300 | ( |
| 301 | title LIKE '%{$term}%' |
| 302 | OR text LIKE '%{$term}%' |
| 303 | ) |
| 304 | ZZZZZZZZZZ; |
| 305 | }, $terms)); |
| 306 | return [ |
| 307 | 'with' => [ |
| 308 | <<<ZZZZZZZZZZ |
| 309 | static_{$ident} AS ( |
| 310 | SELECT |
| 311 | '{$esc_link}' AS link, |
| 312 | {$esc_icon} AS icon, |
| 313 | NULL AS date, |
| 314 | '{$esc_title}' AS title, |
| 315 | {$esc_text} AS text |
| 316 | ) |
| 317 | ZZZZZZZZZZ, |
| 318 | ], |
| 319 | 'query' => <<<ZZZZZZZZZZ |
| 320 | SELECT |
| 321 | link, icon, date, title, text, |
| 322 | {$esc_time_relevance} AS time_relevance |
| 323 | FROM static_{$ident} |
| 324 | WHERE {$static_where} |
| 325 | ZZZZZZZZZZ, |
| 326 | ]; |
| 327 | } |
| 328 | |
| 329 | /** |
| 330 | * @param array<WithQuery|string> $queries |
| 331 | * |
| 332 | * @return WithQuery |
| 333 | */ |
| 334 | public function unionAllQueries(array $queries): array { |
| 335 | $out_withs = []; |
| 336 | $out_queries = []; |
| 337 | foreach ($queries as $query) { |
| 338 | if (is_array($query)) { |
| 339 | $out_queries[] = $query['query']; |
| 340 | foreach ($query['with'] as $with) { |
| 341 | $out_withs[] = $with; |
| 342 | } |
| 343 | } elseif (is_string($query)) { |
| 344 | $out_queries[] = $query; |
| 345 | } // else ignore |
| 346 | } |
| 347 | return [ |
| 348 | 'with' => $out_withs, |
| 349 | 'query' => implode(' UNION ALL ', $out_queries), |
| 350 | ]; |
| 351 | } |
| 352 | |
| 353 | /** @param array<string> $search_terms */ |
| 354 | public function getCutout(string $text, array $search_terms, int $size = 100): string { |
| 355 | $text = $this->censorEmails($text); |
| 356 | $offsets_by_term = $this->getOffsets($text, $search_terms); |
| 357 | |
| 358 | $text_length = mb_strlen($text); |
| 359 | $term_lengths = []; |
| 360 | $term_scores = []; |
| 361 | foreach ($search_terms as $search_term) { |
| 362 | $term_length = mb_strlen($search_term); |
| 363 | $term_lengths[] = $term_length; |
| 364 | $term_scores[] = log($term_length) + 1; |
| 365 | } |
| 366 | |
| 367 | $all_end_offsets = []; |
| 368 | for ($i = 0; $i < count($offsets_by_term); $i++) { |
| 369 | $term_length = $term_lengths[$i]; |
| 370 | foreach ($offsets_by_term[$i] as $offset) { |
| 371 | $all_end_offsets[] = $offset + $term_length; |
| 372 | } |
| 373 | } |
| 374 | $all_end_offsets[] = $text_length; |
| 375 | sort($all_end_offsets); |
| 376 | |
| 377 | $best_cutout_start = 0; |
| 378 | $best_cutout_end = 0; |
| 379 | $best_cutout_score = 0; |
| 380 | $start_idxs = array_map(fn () => -1, $search_terms); |
| 381 | $end_idxs = array_map(fn () => -1, $search_terms); |
| 382 | $after_all = $text_length + 1; |
| 383 | foreach ($all_end_offsets as $offset) { |
| 384 | $start = $offset; |
| 385 | $score = 1; |
| 386 | for ($i = 0; $i < count($search_terms); $i++) { |
| 387 | $term_length = $term_lengths[$i]; |
| 388 | $offsets = $offsets_by_term[$i]; |
| 389 | while (($offsets[$end_idxs[$i] + 1] ?? $after_all) + $term_length <= $offset) { |
| 390 | $end_idxs[$i]++; |
| 391 | } |
| 392 | while (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $offset - $size) { |
| 393 | $start_idxs[$i]++; |
| 394 | } |
| 395 | $num = $end_idxs[$i] - $start_idxs[$i]; |
| 396 | $term_score = $term_scores[$i]; |
| 397 | $score *= ($num * $term_score) + 1; |
| 398 | if (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $start) { |
| 399 | $start = $offsets[$start_idxs[$i] + 1]; |
| 400 | } |
| 401 | } |
| 402 | if ($score > $best_cutout_score) { |
| 403 | $best_cutout_score = $score; |
| 404 | $best_cutout_start = $start; |
| 405 | $best_cutout_end = $offset; |
| 406 | } |
| 407 | } |
| 408 | $best_cutout_length = $best_cutout_end - $best_cutout_start; |
| 409 | $margin_size = ($size - $best_cutout_length) / 2; |
| 410 | $offset = max(0, min($text_length - $size, $best_cutout_start - intval($margin_size))); |
| 411 | return implode('', [ |
| 412 | $offset === 0 ? '' : '…', |
| 413 | trim(mb_substr($text, $offset, $size)), |
| 414 | ($offset + $size >= $text_length) ? '' : '…', |
| 415 | ]); |
| 416 | } |
| 417 | |
| 418 | public function censorEmails(string $text): string { |
| 419 | return preg_replace('/([A-Z0-9a-z._%+-]+)@([A-Za-z0-9.-]+)/', '***@***', $text) ?? ''; |
| 420 | } |
| 421 | |
| 422 | /** |
| 423 | * @param array<string> $search_terms |
| 424 | * |
| 425 | * @return array<array<int>> |
| 426 | */ |
| 427 | public function getOffsets(string $text, array $search_terms): array { |
| 428 | $text_length = mb_strlen($text); |
| 429 | $offsets_by_term = []; |
| 430 | foreach ($search_terms as $search_term) { |
| 431 | $term_length = mb_strlen($search_term); |
| 432 | $term_regex = preg_quote($search_term, '/'); |
| 433 | $parts = preg_split("/({$term_regex})/ui", $text) ?: []; |
| 434 | $part_lengths = array_map(fn ($part) => mb_strlen($part), $parts); |
| 435 | unset($parts); |
| 436 | $offsets = []; |
| 437 | $offset = 0; |
| 438 | $sanity_check = false; |
| 439 | foreach ($part_lengths as $part_length) { |
| 440 | $offset += $part_length; |
| 441 | $sanity_check = $offset === $text_length; |
| 442 | if (!$sanity_check) { // Don't add the last offset; it's just the text length |
| 443 | $offsets[] = $offset; |
| 444 | } |
| 445 | $offset += $term_length; |
| 446 | } |
| 447 | assert($sanity_check, 'Cutout offset sanity check failed'); |
| 448 | $offsets_by_term[] = $offsets; |
| 449 | } |
| 450 | return $offsets_by_term; |
| 451 | } |
| 452 | |
| 453 | /** @param array<string> $search_terms */ |
| 454 | public function highlight(string $text, array $search_terms): string { |
| 455 | $offsets_by_term = $this->getOffsets($text, $search_terms); |
| 456 | $term_lengths = array_map(fn ($term) => mb_strlen($term), $search_terms); |
| 457 | |
| 458 | $ranges = []; |
| 459 | for ($i = 0; $i < count($offsets_by_term); $i++) { |
| 460 | $term_length = $term_lengths[$i]; |
| 461 | foreach ($offsets_by_term[$i] as $offset) { |
| 462 | $ranges[] = [$offset, $offset + $term_length]; |
| 463 | } |
| 464 | } |
| 465 | $merged_ranges = $this->normalizeRanges($ranges); |
| 466 | |
| 467 | $out = ''; |
| 468 | $start_tag = '<span class="highlight">'; |
| 469 | $end_tag = '</span>'; |
| 470 | $last_end = 0; |
| 471 | foreach ($merged_ranges as $range) { |
| 472 | $out .= mb_substr($text, $last_end, $range[0] - $last_end); |
| 473 | $out .= $start_tag; |
| 474 | $out .= mb_substr($text, $range[0], $range[1] - $range[0]); |
| 475 | $out .= $end_tag; |
| 476 | $last_end = $range[1]; |
| 477 | } |
| 478 | $out .= mb_substr($text, $last_end); |
| 479 | return $out; |
| 480 | } |
| 481 | |
| 482 | /** |
| 483 | * @param array<array{0:int, 1:int}> $ranges |
| 484 | * |
| 485 | * @return array<array{0:int, 1:int}> |
| 486 | */ |
| 487 | public function normalizeRanges(array $ranges): array { |
| 488 | usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]); |
| 489 | $normalized_ranges = []; |
| 490 | $num_ranges = count($ranges); |
| 491 | $i = 0; |
| 492 | while ($i < $num_ranges) { |
| 493 | $range = $ranges[$i]; |
| 494 | $start = $range[0]; |
| 495 | $end = $range[1]; |
| 496 | while ($i + 1 < $num_ranges && $ranges[$i + 1][0] <= $end) { // merge next range |
| 497 | $end = max($end, $ranges[$i + 1][1]); |
| 498 | $i++; |
| 499 | } |
| 500 | $normalized_ranges[] = [$start, $end]; |
| 501 | $i++; |
| 502 | } |
| 503 | return $normalized_ranges; |
| 504 | } |
| 505 | } |