Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
99.41% covered (success)
99.41%
168 / 169
92.31% covered (success)
92.31%
12 / 13
CRAP
0.00% covered (danger)
0.00%
0 / 1
SearchUtils
99.41% covered (success)
99.41%
168 / 169
92.31% covered (success)
92.31%
12 / 13
43
0.00% covered (danger)
0.00%
0 / 1
 getSearchResults
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 getPageSearchResults
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 getDateCriteria
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 getStaticSearchResults
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
3
 getScoredSearchResult
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 getDateFormattings
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 analyze
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
4
 getCutout
100.00% covered (success)
100.00%
49 / 49
100.00% covered (success)
100.00%
1 / 1
12
 censorEmails
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getOffsets
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 highlight
100.00% covered (success)
100.00%
20 / 20
100.00% covered (success)
100.00%
1 / 1
4
 normalizeRanges
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 fromEnv
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace Olz\Suche\Utils;
4
5use Doctrine\Common\Collections\Criteria;
6use Doctrine\Common\Collections\Expr\Expression;
7use Olz\Apps\Anmelden\Components\OlzAnmelden\OlzAnmelden;
8use Olz\Apps\Commands\Components\OlzCommands\OlzCommands;
9use Olz\Apps\Files\Components\OlzFiles\OlzFiles;
10use Olz\Apps\Logs\Components\OlzLogs\OlzLogs;
11use Olz\Apps\Members\Components\OlzMembers\OlzMembers;
12use Olz\Apps\Monitoring\Components\OlzMonitoring\OlzMonitoring;
13use Olz\Apps\Newsletter\Components\OlzNewsletter\OlzNewsletter;
14use Olz\Apps\Oev\Components\OlzOev\OlzOev;
15use Olz\Apps\Panini2024\Components\OlzPanini2024\OlzPanini2024;
16use Olz\Apps\Panini2024\Components\OlzPanini2024All\OlzPanini2024All;
17use Olz\Apps\Panini2024\Components\OlzPanini2024Masks\OlzPanini2024Masks;
18use Olz\Apps\Quiz\Components\OlzQuiz\OlzQuiz;
19use Olz\Apps\Results\Components\OlzResults\OlzResults;
20use Olz\Apps\SearchEngines\Components\OlzSearchEngines\OlzSearchEngines;
21use Olz\Apps\Statistics\Components\OlzStatistics\OlzStatistics;
22use Olz\Apps\Youtube\Components\OlzYoutube\OlzYoutube;
23use Olz\Components\Auth\OlzEmailReaktion\OlzEmailReaktion;
24use Olz\Components\Common\OlzRootComponent;
25use Olz\Components\OlzHtmlSitemap\OlzHtmlSitemap;
26use Olz\Components\OtherPages\OlzDatenschutz\OlzDatenschutz;
27use Olz\Components\OtherPages\OlzFuerEinsteiger\OlzFuerEinsteiger;
28use Olz\Components\OtherPages\OlzMaterial\OlzMaterial;
29use Olz\Faq\Components\OlzFaqDetail\OlzFaqDetail;
30use Olz\Faq\Components\OlzFaqList\OlzFaqList;
31use Olz\Karten\Components\OlzKarteDetail\OlzKarteDetail;
32use Olz\Karten\Components\OlzKarten\OlzKarten;
33use Olz\News\Components\OlzNewsDetail\OlzNewsDetail;
34use Olz\News\Components\OlzNewsList\OlzNewsList;
35use Olz\Roles\Components\OlzRolePage\OlzRolePage;
36use Olz\Roles\Components\OlzVerein\OlzVerein;
37use Olz\Service\Components\OlzService\OlzService;
38use Olz\Startseite\Components\OlzStartseite\OlzStartseite;
39use Olz\Suche\Components\OlzSuche\OlzSuche;
40use Olz\Termine\Components\OlzTerminDetail\OlzTerminDetail;
41use Olz\Termine\Components\OlzTermineList\OlzTermineList;
42use Olz\Termine\Components\OlzTerminLocationDetail\OlzTerminLocationDetail;
43use Olz\Termine\Components\OlzTerminLocationsList\OlzTerminLocationsList;
44use Olz\Termine\Components\OlzTerminTemplateDetail\OlzTerminTemplateDetail;
45use Olz\Termine\Components\OlzTerminTemplatesList\OlzTerminTemplatesList;
46use Olz\Users\Components\OlzUserDetail\OlzUserDetail;
47use Olz\Utils\WithUtilsTrait;
48
49/**
50 * @phpstan-type SearchResult array{
51 *   score: float,
52 *   link: non-empty-string,
53 *   icon: ?non-empty-string,
54 *   date: ?\DateTime,
55 *   title: non-empty-string,
56 *   text: ?non-empty-string,
57 * }
58 * @phpstan-type PageSearchResults array{
59 *   title: non-empty-string,
60 *   bestScore: ?float,
61 *   results: array<SearchResult>,
62 * }
63 */
64class SearchUtils {
65    use WithUtilsTrait;
66
67    /** @var array<class-string<OlzRootComponent<mixed>>> */
68    protected static array $all_page_classes = [
69        // All classes that extend `OlzRootComponent` should be listed here:
70        OlzAnmelden::class,
71        OlzCommands::class,
72        OlzFiles::class,
73        OlzLogs::class,
74        OlzMembers::class,
75        OlzMonitoring::class,
76        OlzNewsletter::class,
77        OlzOev::class,
78        OlzPanini2024::class,
79        OlzPanini2024All::class,
80        OlzPanini2024Masks::class,
81        OlzQuiz::class,
82        OlzResults::class,
83        OlzSearchEngines::class,
84        OlzStatistics::class,
85        OlzYoutube::class,
86        OlzEmailReaktion::class,
87        OlzHtmlSitemap::class,
88        OlzDatenschutz::class,
89        OlzFuerEinsteiger::class,
90        OlzMaterial::class,
91        OlzFaqDetail::class,
92        OlzFaqList::class,
93        OlzKarteDetail::class,
94        OlzKarten::class,
95        OlzNewsDetail::class,
96        OlzNewsList::class,
97        OlzRolePage::class,
98        OlzVerein::class,
99        OlzService::class,
100        OlzStartseite::class,
101        OlzSuche::class,
102        OlzTerminDetail::class,
103        OlzTermineList::class,
104        OlzTerminLocationDetail::class,
105        OlzTerminLocationsList::class,
106        OlzTerminTemplateDetail::class,
107        OlzTerminTemplatesList::class,
108        OlzUserDetail::class,
109    ];
110
111    /**
112     * @param array<string> $terms
113     *
114     * @return array<PageSearchResults>
115     */
116    public function getSearchResults(array $terms): array {
117        $results = [];
118        foreach (self::$all_page_classes as $page_class) {
119            $results[] = $this->getPageSearchResults($page_class, $terms);
120        }
121        usort($results, fn ($a, $b) => $b['bestScore'] <=> $a['bestScore']);
122        return $results;
123    }
124
125    /**
126     * @param class-string<OlzRootComponent<array<string, mixed>>> $page_class
127     * @param array<string>                                        $terms
128     *
129     * @return PageSearchResults
130     */
131    protected function getPageSearchResults(string $page_class, array $terms): array {
132        $page = new $page_class();
133        $results = $page->getSearchResults($terms);
134        usort($results, fn ($a, $b) => $b['score'] <=> $a['score']);
135        $first_result = $results[0] ?? null;
136        $best_score = $first_result['score'] ?? null;
137        return [
138            'title' => $page->getSearchTitle(),
139            'bestScore' => $best_score,
140            'results' => $results,
141        ];
142    }
143
144    /** @return array<Expression> */
145    public function getDateCriteria(string $field, string $term): array {
146        $result = $this->dateUtils()->parseDateTimeRange($term);
147        if ($result === null) {
148            return [];
149        }
150        return [Criteria::expr()->andX(
151            Criteria::expr()->gte($field, $result['start']),
152            Criteria::expr()->lt($field, $result['end']),
153        )];
154    }
155
156    /**
157     * @param array<string> $terms
158     * @param array{
159     *   link: non-empty-string,
160     *   icon?: ?non-empty-string,
161     *   date?: ?\DateTime,
162     *   title: non-empty-string,
163     * } $defaults
164     *
165     * @return array<SearchResult>
166     */
167    public function getStaticSearchResults(
168        string $content,
169        array $terms,
170        array $defaults,
171    ): array {
172        $search_space = "{$content} {$defaults['title']}";
173        $analysis = $this->analyze($search_space, $defaults['date'] ?? null, $terms);
174        if (!$analysis['hasAll']) {
175            return [];
176        }
177        return [
178            [
179                'score' => $analysis['score'],
180                'icon' => null,
181                'date' => null,
182                'text' => $this->searchUtils()->getCutout($content, $terms) ?: null,
183                ...$defaults,
184            ],
185        ];
186    }
187
188    /**
189     * @param array{
190     *   link: non-empty-string,
191     *   icon?: ?non-empty-string,
192     *   date?: ?\DateTime,
193     *   title: non-empty-string,
194     *   text?: ?non-empty-string,
195     * } $result
196     * @param array<string> $terms
197     *
198     * @return SearchResult
199     */
200    public function getScoredSearchResult(
201        array $result,
202        array $terms,
203    ): array {
204        $text_str = $result['text'] ?? '';
205        $search_space = "{$text_str} {$result['title']}";
206        $analysis = $this->analyze($search_space, $result['date'] ?? null, $terms);
207        return [
208            'icon' => null,
209            'date' => null,
210            ...$result,
211            'score' => $analysis['score'],
212            'text' => $this->searchUtils()->getCutout($text_str, $terms) ?: null,
213        ];
214    }
215
216    /** @return array<string> */
217    public function getDateFormattings(?\DateTime $date): array {
218        if ($date === null) {
219            return [];
220        }
221        return [
222            $date->format('Y-m-d'),
223            $date->format('d.m.Y'),
224            $date->format('j.n.Y'),
225        ];
226    }
227
228    /**
229     * @param array<string> $terms
230     *
231     * @return array{score: float, hasAll: bool}
232     */
233    public function analyze(string $content, ?\DateTime $date, array $terms): array {
234        $date_formattings = implode(' ', $this->getDateFormattings($date));
235        $has_all = true;
236        $sum_occurrences = 0;
237        foreach ($terms as $term) {
238            $esc_term = preg_quote($term);
239            $num_occurrences = preg_match_all("/{$esc_term}/i", $content, $matches);
240            if (preg_match("/{$esc_term}/i", $date_formattings)) {
241                $num_occurrences++;
242            }
243            $sum_occurrences += $num_occurrences;
244            if (!$num_occurrences) {
245                $has_all = false;
246            }
247        }
248        $score = round(1 - (1 / ($sum_occurrences / count($terms) + 1)), 5);
249        return ['score' => $score, 'hasAll' => $has_all];
250    }
251
252    /** @param array<string> $search_terms */
253    public function getCutout(string $text, array $search_terms, int $size = 100): string {
254        $text = $this->censorEmails($text);
255        $offsets_by_term = $this->getOffsets($text, $search_terms);
256
257        $text_length = mb_strlen($text);
258        $term_lengths = [];
259        $term_scores = [];
260        foreach ($search_terms as $search_term) {
261            $term_length = mb_strlen($search_term);
262            $term_lengths[] = $term_length;
263            $term_scores[] = log($term_length) + 1;
264        }
265
266        $all_end_offsets = [];
267        for ($i = 0; $i < count($offsets_by_term); $i++) {
268            $term_length = $term_lengths[$i];
269            foreach ($offsets_by_term[$i] as $offset) {
270                $all_end_offsets[] = $offset + $term_length;
271            }
272        }
273        $all_end_offsets[] = $text_length;
274        sort($all_end_offsets);
275
276        $best_cutout_start = 0;
277        $best_cutout_end = 0;
278        $best_cutout_score = 0;
279        $start_idxs = array_map(fn () => -1, $search_terms);
280        $end_idxs = array_map(fn () => -1, $search_terms);
281        $after_all = $text_length + 1;
282        foreach ($all_end_offsets as $offset) {
283            $start = $offset;
284            $score = 1;
285            for ($i = 0; $i < count($search_terms); $i++) {
286                $term_length = $term_lengths[$i];
287                $offsets = $offsets_by_term[$i];
288                while (($offsets[$end_idxs[$i] + 1] ?? $after_all) + $term_length <= $offset) {
289                    $end_idxs[$i]++;
290                }
291                while (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $offset - $size) {
292                    $start_idxs[$i]++;
293                }
294                $num = $end_idxs[$i] - $start_idxs[$i];
295                $term_score = $term_scores[$i];
296                $score *= ($num * $term_score) + 1;
297                if (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $start) {
298                    $start = $offsets[$start_idxs[$i] + 1];
299                }
300            }
301            if ($score > $best_cutout_score) {
302                $best_cutout_score = $score;
303                $best_cutout_start = $start;
304                $best_cutout_end = $offset;
305            }
306        }
307        $best_cutout_length = $best_cutout_end - $best_cutout_start;
308        $margin_size = ($size - $best_cutout_length) / 2;
309        $offset = max(0, min($text_length - $size, $best_cutout_start - intval($margin_size)));
310        return implode('', [
311            $offset === 0 ? '' : '…',
312            trim(mb_substr($text, $offset, $size)),
313            ($offset + $size >= $text_length) ? '' : '…',
314        ]);
315    }
316
317    public function censorEmails(string $text): string {
318        return preg_replace('/([A-Z0-9a-z._%+-]+)@([A-Za-z0-9.-]+)/', '***@***', $text) ?? '';
319    }
320
321    /**
322     * @param array<string> $search_terms
323     *
324     * @return array<array<int>>
325     */
326    public function getOffsets(string $text, array $search_terms): array {
327        $text_length = mb_strlen($text);
328        $offsets_by_term = [];
329        foreach ($search_terms as $search_term) {
330            $term_length = mb_strlen($search_term);
331            $term_regex = preg_quote($search_term, '/');
332            $parts = preg_split("/({$term_regex})/ui", $text) ?: [];
333            $part_lengths = array_map(fn ($part) => mb_strlen($part), $parts);
334            unset($parts);
335            $offsets = [];
336            $offset = 0;
337            $sanity_check = false;
338            foreach ($part_lengths as $part_length) {
339                $offset += $part_length;
340                $sanity_check = $offset === $text_length;
341                if (!$sanity_check) { // Don't add the last offset; it's just the text length
342                    $offsets[] = $offset;
343                }
344                $offset += $term_length;
345            }
346            assert($sanity_check, 'Cutout offset sanity check failed');
347            $offsets_by_term[] = $offsets;
348        }
349        return $offsets_by_term;
350    }
351
352    /** @param array<string> $search_terms */
353    public function highlight(string $text, array $search_terms): string {
354        $offsets_by_term = $this->getOffsets($text, $search_terms);
355        $term_lengths = array_map(fn ($term) => mb_strlen($term), $search_terms);
356
357        $ranges = [];
358        for ($i = 0; $i < count($offsets_by_term); $i++) {
359            $term_length = $term_lengths[$i];
360            foreach ($offsets_by_term[$i] as $offset) {
361                $ranges[] = [$offset, $offset + $term_length];
362            }
363        }
364        $merged_ranges = $this->normalizeRanges($ranges);
365
366        $out = '';
367        $start_tag = '<span class="highlight">';
368        $end_tag = '</span>';
369        $last_end = 0;
370        foreach ($merged_ranges as $range) {
371            $out .= mb_substr($text, $last_end, $range[0] - $last_end);
372            $out .= $start_tag;
373            $out .= mb_substr($text, $range[0], $range[1] - $range[0]);
374            $out .= $end_tag;
375            $last_end = $range[1];
376        }
377        $out .= mb_substr($text, $last_end);
378        return $out;
379    }
380
381    /**
382     * @param array<array{0:int, 1:int}> $ranges
383     *
384     * @return array<array{0:int, 1:int}>
385     */
386    public function normalizeRanges(array $ranges): array {
387        usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]);
388        $normalized_ranges = [];
389        $num_ranges = count($ranges);
390        $i = 0;
391        while ($i < $num_ranges) {
392            $range = $ranges[$i];
393            $start = $range[0];
394            $end = $range[1];
395            while ($i + 1 < $num_ranges && $ranges[$i + 1][0] <= $end) { // merge next range
396                $end = max($end, $ranges[$i + 1][1]);
397                $i++;
398            }
399            $normalized_ranges[] = [$start, $end];
400            $i++;
401        }
402        return $normalized_ranges;
403    }
404
405    public static function fromEnv(): self {
406        return new self();
407    }
408}