Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
99.45% covered (success)
99.45%
180 / 181
92.31% covered (success)
92.31%
12 / 13
CRAP
0.00% covered (danger)
0.00%
0 / 1
SearchUtils
99.45% covered (success)
99.45%
180 / 181
92.31% covered (success)
92.31%
12 / 13
45
0.00% covered (danger)
0.00%
0 / 1
 getSearchResults
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 getPageSearchResults
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 getDateCriteria
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 getStaticSearchResults
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
3
 getScoredSearchResult
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 getDateFormattings
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 analyze
100.00% covered (success)
100.00%
25 / 25
100.00% covered (success)
100.00%
1 / 1
6
 getCutout
100.00% covered (success)
100.00%
49 / 49
100.00% covered (success)
100.00%
1 / 1
12
 censorEmails
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getOffsets
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 highlight
100.00% covered (success)
100.00%
20 / 20
100.00% covered (success)
100.00%
1 / 1
4
 normalizeRanges
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 fromEnv
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace Olz\Suche\Utils;
4
5use Doctrine\Common\Collections\Criteria;
6use Doctrine\Common\Collections\Expr\Expression;
7use Olz\Apps\Anmelden\Components\OlzAnmelden\OlzAnmelden;
8use Olz\Apps\Commands\Components\OlzCommands\OlzCommands;
9use Olz\Apps\Files\Components\OlzFiles\OlzFiles;
10use Olz\Apps\Logs\Components\OlzLogs\OlzLogs;
11use Olz\Apps\Members\Components\OlzMembers\OlzMembers;
12use Olz\Apps\Monitoring\Components\OlzMonitoring\OlzMonitoring;
13use Olz\Apps\Newsletter\Components\OlzNewsletter\OlzNewsletter;
14use Olz\Apps\Oev\Components\OlzOev\OlzOev;
15use Olz\Apps\Panini2024\Components\OlzPanini2024\OlzPanini2024;
16use Olz\Apps\Panini2024\Components\OlzPanini2024All\OlzPanini2024All;
17use Olz\Apps\Panini2024\Components\OlzPanini2024Masks\OlzPanini2024Masks;
18use Olz\Apps\Quiz\Components\OlzQuiz\OlzQuiz;
19use Olz\Apps\Results\Components\OlzResults\OlzResults;
20use Olz\Apps\SearchEngines\Components\OlzSearchEngines\OlzSearchEngines;
21use Olz\Apps\Statistics\Components\OlzStatistics\OlzStatistics;
22use Olz\Apps\Youtube\Components\OlzYoutube\OlzYoutube;
23use Olz\Components\Auth\OlzEmailReaktion\OlzEmailReaktion;
24use Olz\Components\Common\OlzRootComponent;
25use Olz\Components\OlzHtmlSitemap\OlzHtmlSitemap;
26use Olz\Components\OtherPages\OlzDatenschutz\OlzDatenschutz;
27use Olz\Components\OtherPages\OlzFuerEinsteiger\OlzFuerEinsteiger;
28use Olz\Components\OtherPages\OlzMaterial\OlzMaterial;
29use Olz\Faq\Components\OlzFaqDetail\OlzFaqDetail;
30use Olz\Faq\Components\OlzFaqList\OlzFaqList;
31use Olz\Karten\Components\OlzKarteDetail\OlzKarteDetail;
32use Olz\Karten\Components\OlzKarten\OlzKarten;
33use Olz\News\Components\OlzNewsDetail\OlzNewsDetail;
34use Olz\News\Components\OlzNewsList\OlzNewsList;
35use Olz\Roles\Components\OlzRolePage\OlzRolePage;
36use Olz\Roles\Components\OlzVerein\OlzVerein;
37use Olz\Service\Components\OlzService\OlzService;
38use Olz\Startseite\Components\OlzStartseite\OlzStartseite;
39use Olz\Suche\Components\OlzSuche\OlzSuche;
40use Olz\Termine\Components\OlzTerminDetail\OlzTerminDetail;
41use Olz\Termine\Components\OlzTermineList\OlzTermineList;
42use Olz\Termine\Components\OlzTerminLocationDetail\OlzTerminLocationDetail;
43use Olz\Termine\Components\OlzTerminLocationsList\OlzTerminLocationsList;
44use Olz\Termine\Components\OlzTerminTemplateDetail\OlzTerminTemplateDetail;
45use Olz\Termine\Components\OlzTerminTemplatesList\OlzTerminTemplatesList;
46use Olz\Users\Components\OlzUserDetail\OlzUserDetail;
47use Olz\Utils\WithUtilsTrait;
48
49/**
50 * @phpstan-type SearchResult array{
51 *   score: float,
52 *   link: non-empty-string,
53 *   icon: ?non-empty-string,
54 *   date: ?\DateTime,
55 *   title: non-empty-string,
56 *   text: ?non-empty-string,
57 * }
58 * @phpstan-type PageSearchResults array{
59 *   title: non-empty-string,
60 *   bestScore: ?float,
61 *   results: array<SearchResult>,
62 * }
63 */
64class SearchUtils {
65    use WithUtilsTrait;
66
67    /** @var array<class-string<OlzRootComponent<mixed>>> */
68    protected static array $all_page_classes = [
69        // All classes that extend `OlzRootComponent` should be listed here:
70        OlzAnmelden::class,
71        OlzCommands::class,
72        OlzFiles::class,
73        OlzLogs::class,
74        OlzMembers::class,
75        OlzMonitoring::class,
76        OlzNewsletter::class,
77        OlzOev::class,
78        OlzPanini2024::class,
79        OlzPanini2024All::class,
80        OlzPanini2024Masks::class,
81        OlzQuiz::class,
82        OlzResults::class,
83        OlzSearchEngines::class,
84        OlzStatistics::class,
85        OlzYoutube::class,
86        OlzEmailReaktion::class,
87        OlzHtmlSitemap::class,
88        OlzDatenschutz::class,
89        OlzFuerEinsteiger::class,
90        OlzMaterial::class,
91        OlzFaqDetail::class,
92        OlzFaqList::class,
93        OlzKarteDetail::class,
94        OlzKarten::class,
95        OlzNewsDetail::class,
96        OlzNewsList::class,
97        OlzRolePage::class,
98        OlzVerein::class,
99        OlzService::class,
100        OlzStartseite::class,
101        OlzSuche::class,
102        OlzTerminDetail::class,
103        OlzTermineList::class,
104        OlzTerminLocationDetail::class,
105        OlzTerminLocationsList::class,
106        OlzTerminTemplateDetail::class,
107        OlzTerminTemplatesList::class,
108        OlzUserDetail::class,
109    ];
110
111    /**
112     * @param array<string> $terms
113     *
114     * @return array<PageSearchResults>
115     */
116    public function getSearchResults(array $terms): array {
117        $results = [];
118        foreach (self::$all_page_classes as $page_class) {
119            $results[] = $this->getPageSearchResults($page_class, $terms);
120        }
121        usort($results, fn ($a, $b) => $b['bestScore'] <=> $a['bestScore']);
122        return $results;
123    }
124
125    /**
126     * @param class-string<OlzRootComponent<array<string, mixed>>> $page_class
127     * @param array<string>                                        $terms
128     *
129     * @return PageSearchResults
130     */
131    protected function getPageSearchResults(string $page_class, array $terms): array {
132        $page = new $page_class();
133        $results = $page->getSearchResults($terms);
134        usort($results, fn ($a, $b) => $b['score'] <=> $a['score']);
135        $first_result = $results[0] ?? null;
136        $best_score = $first_result['score'] ?? null;
137        return [
138            'title' => $page->getSearchTitle(),
139            'bestScore' => $best_score,
140            'results' => $results,
141        ];
142    }
143
144    /** @return array<Expression> */
145    public function getDateCriteria(string $field, string $term): array {
146        $result = $this->dateUtils()->parseDateTimeRange($term);
147        if ($result === null) {
148            return [];
149        }
150        return [Criteria::expr()->andX(
151            Criteria::expr()->gte($field, $result['start']),
152            Criteria::expr()->lt($field, $result['end']),
153        )];
154    }
155
156    /**
157     * @param array<string> $terms
158     * @param array{
159     *   link: non-empty-string,
160     *   icon?: ?non-empty-string,
161     *   date?: ?\DateTime,
162     *   title: non-empty-string,
163     * } $defaults
164     *
165     * @return array<SearchResult>
166     */
167    public function getStaticSearchResults(
168        string $content,
169        array $terms,
170        array $defaults,
171    ): array {
172        $search_space = "{$content} {$defaults['title']}";
173        $analysis = $this->analyze($search_space, $defaults['date'] ?? null, $terms);
174        if (!$analysis['hasAll']) {
175            return [];
176        }
177        return [
178            [
179                'score' => $analysis['score'],
180                'icon' => null,
181                'date' => null,
182                'text' => $this->searchUtils()->getCutout($content, $terms) ?: null,
183                ...$defaults,
184            ],
185        ];
186    }
187
188    /**
189     * @param array{
190     *   link: non-empty-string,
191     *   icon?: ?non-empty-string,
192     *   date?: ?\DateTime,
193     *   title: non-empty-string,
194     *   text?: ?non-empty-string,
195     * } $result
196     * @param array<string> $terms
197     *
198     * @return SearchResult
199     */
200    public function getScoredSearchResult(
201        array $result,
202        array $terms,
203    ): array {
204        $text_str = $result['text'] ?? '';
205        // Count title matches double
206        $search_space = "{$result['title']} {$text_str} {$result['title']}";
207        $analysis = $this->analyze($search_space, $result['date'] ?? null, $terms);
208        return [
209            'icon' => null,
210            'date' => null,
211            ...$result,
212            'score' => $analysis['score'],
213            'text' => $this->searchUtils()->getCutout($text_str, $terms) ?: null,
214        ];
215    }
216
217    /** @return array<string> */
218    public function getDateFormattings(?\DateTime $date): array {
219        if ($date === null) {
220            return [];
221        }
222        return [
223            $date->format('Y-m-d'),
224            $date->format('d.m.Y'),
225            $date->format('j.n.Y'),
226        ];
227    }
228
229    /**
230     * @param array<string> $terms
231     *
232     * @return array{score: float, hasAll: bool}
233     */
234    public function analyze(string $content, ?\DateTime $date, array $terms): array {
235        $date_formattings = implode(' ', $this->getDateFormattings($date));
236        $has_all = true;
237        $sum_occurrences = 0;
238        foreach ($terms as $term) {
239            $esc_term = preg_quote($term);
240            $num_occurrences = preg_match_all("/{$esc_term}/i", $content);
241            // Add preference to full-word/start-of-word/end-of-word matches
242            $num_occurrences += preg_match_all("/(\\W|^){$esc_term}/i", $content);
243            $num_occurrences += preg_match_all("/{$esc_term}(\\W|$)/i", $content);
244            if (preg_match("/{$esc_term}/i", $date_formattings)) {
245                $num_occurrences++;
246            }
247            $sum_occurrences += $num_occurrences;
248            if (!$num_occurrences) {
249                $has_all = false;
250            }
251        }
252        $num_terms = count($terms);
253        // Add preference to term combination matches
254        for ($num_combined = 2; $num_combined <= min(4, $num_terms); $num_combined++) {
255            for ($start_combined = 0; $start_combined <= $num_terms - $num_combined; $start_combined++) {
256                $combined_terms = array_slice($terms, $start_combined, $num_combined);
257                $esc_combined_terms = implode('(\W{0,5}|\s*)', array_map(
258                    fn ($term) => preg_quote($term),
259                    $combined_terms
260                ));
261                $num_occurrences = preg_match_all("/{$esc_combined_terms}/i", $content);
262                // TODO: Combined date formattings?
263                $sum_occurrences += $num_occurrences * $num_combined;
264            }
265        }
266        $score = round(1 - (1 / ($sum_occurrences / $num_terms + 1)), 5);
267        return ['score' => $score, 'hasAll' => $has_all];
268    }
269
270    /** @param array<string> $search_terms */
271    public function getCutout(string $text, array $search_terms, int $size = 100): string {
272        $text = $this->censorEmails($text);
273        $offsets_by_term = $this->getOffsets($text, $search_terms);
274
275        $text_length = mb_strlen($text);
276        $term_lengths = [];
277        $term_scores = [];
278        foreach ($search_terms as $search_term) {
279            $term_length = mb_strlen($search_term);
280            $term_lengths[] = $term_length;
281            $term_scores[] = log($term_length) + 1;
282        }
283
284        $all_end_offsets = [];
285        for ($i = 0; $i < count($offsets_by_term); $i++) {
286            $term_length = $term_lengths[$i];
287            foreach ($offsets_by_term[$i] as $offset) {
288                $all_end_offsets[] = $offset + $term_length;
289            }
290        }
291        $all_end_offsets[] = $text_length;
292        sort($all_end_offsets);
293
294        $best_cutout_start = 0;
295        $best_cutout_end = 0;
296        $best_cutout_score = 0;
297        $start_idxs = array_map(fn () => -1, $search_terms);
298        $end_idxs = array_map(fn () => -1, $search_terms);
299        $after_all = $text_length + 1;
300        foreach ($all_end_offsets as $offset) {
301            $start = $offset;
302            $score = 1;
303            for ($i = 0; $i < count($search_terms); $i++) {
304                $term_length = $term_lengths[$i];
305                $offsets = $offsets_by_term[$i];
306                while (($offsets[$end_idxs[$i] + 1] ?? $after_all) + $term_length <= $offset) {
307                    $end_idxs[$i]++;
308                }
309                while (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $offset - $size) {
310                    $start_idxs[$i]++;
311                }
312                $num = $end_idxs[$i] - $start_idxs[$i];
313                $term_score = $term_scores[$i];
314                $score *= ($num * $term_score) + 1;
315                if (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $start) {
316                    $start = $offsets[$start_idxs[$i] + 1];
317                }
318            }
319            if ($score > $best_cutout_score) {
320                $best_cutout_score = $score;
321                $best_cutout_start = $start;
322                $best_cutout_end = $offset;
323            }
324        }
325        $best_cutout_length = $best_cutout_end - $best_cutout_start;
326        $margin_size = ($size - $best_cutout_length) / 2;
327        $offset = max(0, min($text_length - $size, $best_cutout_start - intval($margin_size)));
328        return implode('', [
329            $offset === 0 ? '' : '…',
330            trim(mb_substr($text, $offset, $size)),
331            ($offset + $size >= $text_length) ? '' : '…',
332        ]);
333    }
334
335    public function censorEmails(string $text): string {
336        return preg_replace('/([A-Z0-9a-z._%+-]+)@([A-Za-z0-9.-]+)/', '***@***', $text) ?? '';
337    }
338
339    /**
340     * @param array<string> $search_terms
341     *
342     * @return array<array<int>>
343     */
344    public function getOffsets(string $text, array $search_terms): array {
345        $text_length = mb_strlen($text);
346        $offsets_by_term = [];
347        foreach ($search_terms as $search_term) {
348            $term_length = mb_strlen($search_term);
349            $term_regex = preg_quote($search_term, '/');
350            $parts = preg_split("/({$term_regex})/ui", $text) ?: [];
351            $part_lengths = array_map(fn ($part) => mb_strlen($part), $parts);
352            unset($parts);
353            $offsets = [];
354            $offset = 0;
355            $sanity_check = false;
356            foreach ($part_lengths as $part_length) {
357                $offset += $part_length;
358                $sanity_check = $offset === $text_length;
359                if (!$sanity_check) { // Don't add the last offset; it's just the text length
360                    $offsets[] = $offset;
361                }
362                $offset += $term_length;
363            }
364            assert($sanity_check, 'Cutout offset sanity check failed');
365            $offsets_by_term[] = $offsets;
366        }
367        return $offsets_by_term;
368    }
369
370    /** @param array<string> $search_terms */
371    public function highlight(string $text, array $search_terms): string {
372        $offsets_by_term = $this->getOffsets($text, $search_terms);
373        $term_lengths = array_map(fn ($term) => mb_strlen($term), $search_terms);
374
375        $ranges = [];
376        for ($i = 0; $i < count($offsets_by_term); $i++) {
377            $term_length = $term_lengths[$i];
378            foreach ($offsets_by_term[$i] as $offset) {
379                $ranges[] = [$offset, $offset + $term_length];
380            }
381        }
382        $merged_ranges = $this->normalizeRanges($ranges);
383
384        $out = '';
385        $start_tag = '<span class="highlight">';
386        $end_tag = '</span>';
387        $last_end = 0;
388        foreach ($merged_ranges as $range) {
389            $out .= mb_substr($text, $last_end, $range[0] - $last_end);
390            $out .= $start_tag;
391            $out .= mb_substr($text, $range[0], $range[1] - $range[0]);
392            $out .= $end_tag;
393            $last_end = $range[1];
394        }
395        $out .= mb_substr($text, $last_end);
396        return $out;
397    }
398
399    /**
400     * @param array<array{0:int, 1:int}> $ranges
401     *
402     * @return array<array{0:int, 1:int}>
403     */
404    public function normalizeRanges(array $ranges): array {
405        usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]);
406        $normalized_ranges = [];
407        $num_ranges = count($ranges);
408        $i = 0;
409        while ($i < $num_ranges) {
410            $range = $ranges[$i];
411            $start = $range[0];
412            $end = $range[1];
413            while ($i + 1 < $num_ranges && $ranges[$i + 1][0] <= $end) { // merge next range
414                $end = max($end, $ranges[$i + 1][1]);
415                $i++;
416            }
417            $normalized_ranges[] = [$start, $end];
418            $i++;
419        }
420        return $normalized_ranges;
421    }
422
423    public static function fromEnv(): self {
424        return new self();
425    }
426}