Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
57.29% covered (warning)
57.29%
110 / 192
66.67% covered (warning)
66.67%
6 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
SearchUtils
57.29% covered (warning)
57.29%
110 / 192
66.67% covered (warning)
66.67%
6 / 9
164.64
0.00% covered (danger)
0.00%
0 / 1
 getSearchResults
0.00% covered (danger)
0.00%
0 / 73
0.00% covered (danger)
0.00%
0 / 1
90
 sqlExecute
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getDateFormattings
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 getDateSql
0.00% covered (danger)
0.00%
0 / 8
0.00% covered (danger)
0.00%
0 / 1
6
 getCutout
100.00% covered (success)
100.00%
49 / 49
100.00% covered (success)
100.00%
1 / 1
12
 censorEmails
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getOffsets
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 highlight
100.00% covered (success)
100.00%
20 / 20
100.00% covered (success)
100.00%
1 / 1
4
 normalizeRanges
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
1<?php
2
3namespace Olz\Suche\Utils;
4
5use Doctrine\ORM\Query\ResultSetMapping;
6use Olz\Apps\Anmelden\Components\OlzAnmelden\OlzAnmelden;
7use Olz\Apps\Commands\Components\OlzCommands\OlzCommands;
8use Olz\Apps\Files\Components\OlzFiles\OlzFiles;
9use Olz\Apps\Logs\Components\OlzLogs\OlzLogs;
10use Olz\Apps\Members\Components\OlzMembers\OlzMembers;
11use Olz\Apps\Monitoring\Components\OlzMonitoring\OlzMonitoring;
12use Olz\Apps\Newsletter\Components\OlzNewsletter\OlzNewsletter;
13use Olz\Apps\Oev\Components\OlzOev\OlzOev;
14use Olz\Apps\Panini2024\Components\OlzPanini2024\OlzPanini2024;
15use Olz\Apps\Panini2024\Components\OlzPanini2024All\OlzPanini2024All;
16use Olz\Apps\Panini2024\Components\OlzPanini2024Masks\OlzPanini2024Masks;
17use Olz\Apps\Quiz\Components\OlzQuiz\OlzQuiz;
18use Olz\Apps\Results\Components\OlzResults\OlzResults;
19use Olz\Apps\SearchEngines\Components\OlzSearchEngines\OlzSearchEngines;
20use Olz\Apps\Statistics\Components\OlzStatistics\OlzStatistics;
21use Olz\Apps\Youtube\Components\OlzYoutube\OlzYoutube;
22use Olz\Components\Auth\OlzEmailReaktion\OlzEmailReaktion;
23use Olz\Components\Common\OlzRootComponent;
24use Olz\Components\OlzHtmlSitemap\OlzHtmlSitemap;
25use Olz\Components\OtherPages\OlzDatenschutz\OlzDatenschutz;
26use Olz\Components\OtherPages\OlzFuerEinsteiger\OlzFuerEinsteiger;
27use Olz\Components\OtherPages\OlzMaterial\OlzMaterial;
28use Olz\Faq\Components\OlzFaqDetail\OlzFaqDetail;
29use Olz\Faq\Components\OlzFaqList\OlzFaqList;
30use Olz\Karten\Components\OlzKarteDetail\OlzKarteDetail;
31use Olz\Karten\Components\OlzKarten\OlzKarten;
32use Olz\News\Components\OlzNewsDetail\OlzNewsDetail;
33use Olz\News\Components\OlzNewsList\OlzNewsList;
34use Olz\Roles\Components\OlzRolePage\OlzRolePage;
35use Olz\Roles\Components\OlzVerein\OlzVerein;
36use Olz\Service\Components\OlzService\OlzService;
37use Olz\Startseite\Components\OlzStartseite\OlzStartseite;
38use Olz\Suche\Components\OlzSuche\OlzSuche;
39use Olz\Termine\Components\OlzTerminDetail\OlzTerminDetail;
40use Olz\Termine\Components\OlzTermineList\OlzTermineList;
41use Olz\Termine\Components\OlzTerminLocationDetail\OlzTerminLocationDetail;
42use Olz\Termine\Components\OlzTerminLocationsList\OlzTerminLocationsList;
43use Olz\Termine\Components\OlzTerminTemplateDetail\OlzTerminTemplateDetail;
44use Olz\Termine\Components\OlzTerminTemplatesList\OlzTerminTemplatesList;
45use Olz\Users\Components\OlzUserDetail\OlzUserDetail;
46use Olz\Utils\WithUtilsTrait;
47
48/**
49 * @phpstan-type SearchResult array{
50 *   link: non-empty-string,
51 *   icon: ?non-empty-string,
52 *   date: ?\DateTime,
53 *   title: non-empty-string,
54 *   text: ?non-empty-string,
55 *   score: float,
56 *   debug: string,
57 * }
58 */
59class SearchUtils {
60    use WithUtilsTrait;
61
62    /** @var array<class-string<OlzRootComponent<mixed>>> */
63    protected static array $all_page_classes = [
64        // All classes that extend `OlzRootComponent` should be listed here:
65        OlzAnmelden::class,
66        OlzCommands::class,
67        OlzFiles::class,
68        OlzLogs::class,
69        OlzMembers::class,
70        OlzMonitoring::class,
71        OlzNewsletter::class,
72        OlzOev::class,
73        OlzPanini2024::class,
74        OlzPanini2024All::class,
75        OlzPanini2024Masks::class,
76        OlzQuiz::class,
77        OlzResults::class,
78        OlzSearchEngines::class,
79        OlzStatistics::class,
80        OlzYoutube::class,
81        OlzEmailReaktion::class,
82        OlzHtmlSitemap::class,
83        OlzDatenschutz::class,
84        OlzFuerEinsteiger::class,
85        OlzMaterial::class,
86        OlzFaqDetail::class,
87        OlzFaqList::class,
88        OlzKarteDetail::class,
89        OlzKarten::class,
90        OlzNewsDetail::class,
91        OlzNewsList::class,
92        OlzRolePage::class,
93        OlzVerein::class,
94        OlzService::class,
95        OlzStartseite::class,
96        OlzSuche::class,
97        OlzTerminDetail::class,
98        OlzTermineList::class,
99        OlzTerminLocationDetail::class,
100        OlzTerminLocationsList::class,
101        OlzTerminTemplateDetail::class,
102        OlzTerminTemplatesList::class,
103        OlzUserDetail::class,
104    ];
105
106    /**
107     * @param array<string> $terms
108     *
109     * @return array<SearchResult>
110     */
111    public function getSearchResults(array $terms): array {
112        $db = $this->dbUtils()->getDb();
113        $num_terms = count($terms);
114        $esc_terms = array_map(fn ($term) => $db->real_escape_string($term), $terms);
115        $page_queries = [];
116        $page_withs = [];
117        foreach (self::$all_page_classes as $page_class) {
118            $page = new $page_class();
119            $result = $page->searchSql($esc_terms);
120            if (is_array($result)) {
121                $page_queries[] = $result['query'];
122                foreach ($result['with'] as $with) {
123                    $page_withs[] = $with;
124                }
125            } elseif (is_string($result)) {
126                $page_queries[] = $result;
127            } // else ignore
128        }
129        $page_withs_sql = implode(',', $page_withs);
130        $merged_sql = implode(' UNION ALL ', $page_queries);
131
132        $idx = 0;
133        $term_evaluation_sqls = [];
134        $term_evaluation_columns = [];
135        foreach ($terms as $term) {
136            $esc_term = $db->real_escape_string(preg_quote($term));
137            $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_term}') AS term{$idx}_any";
138            $term_evaluation_columns[] = "term{$idx}_any";
139            // Add preference for word matches
140            $term_evaluation_sqls[] = "get_quality(concatted_content, '(?=\\\\W|^){$esc_term}') AS term{$idx}_prefix";
141            $term_evaluation_columns[] = "term{$idx}_prefix";
142            $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_term}(?=\\\\W|$)') AS term{$idx}_suffix";
143            $term_evaluation_columns[] = "term{$idx}_suffix";
144            $idx++;
145        }
146        // Add preference to term combination matches
147        for ($num_combined = 2; $num_combined <= min(3, $num_terms); $num_combined++) {
148            for ($start_combined = 0; $start_combined <= $num_terms - $num_combined; $start_combined++) {
149                $combined_terms = array_slice($terms, $start_combined, $num_combined);
150                $esc_combined_terms = $db->real_escape_string(implode('(\W{0,5}|\s*)', array_map(
151                    fn ($term) => preg_quote($term),
152                    $combined_terms
153                )));
154                $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_combined_terms}') AS terms_{$start_combined}_{$num_combined}";
155                $term_evaluation_columns[] = "terms_{$start_combined}_{$num_combined}";
156                // TODO: Combined date formattings?
157            }
158        }
159        $term_evaluation_sql = implode(',', $term_evaluation_sqls);
160        $term_quality_sql = implode('+', $term_evaluation_columns);
161        $rsm = new ResultSetMapping();
162        $rsm->addScalarResult('link', 'link', 'string');
163        $rsm->addScalarResult('icon', 'icon', 'string');
164        $rsm->addScalarResult('date', 'date', 'date');
165        $rsm->addScalarResult('title', 'title', 'string');
166        $rsm->addScalarResult('text', 'text', 'string');
167        $rsm->addScalarResult('score', 'score', 'string');
168        $rsm->addScalarResult('time_relevance', 'time_relevance', 'string');
169        $this->sqlExecute('DROP FUNCTION IF EXISTS get_quality');
170        $this->sqlExecute(<<<'ZZZZZZZZZZ'
171            CREATE FUNCTION get_quality (content TEXT, regex TEXT)
172                RETURNS FLOAT DETERMINISTIC
173                RETURN (LENGTH(content) - LENGTH(REGEXP_REPLACE(content, regex, ''))) / SQRT(LENGTH(content))
174            ZZZZZZZZZZ);
175        $sql = <<<ZZZZZZZZZZ
176            WITH
177                {$page_withs_sql},
178                merged AS ({$merged_sql}),
179                concatted AS (
180                    SELECT
181                        *,
182                        CONCAT(IFNULL(title, ''), ' ', IFNULL(title, ''), ' ', IFNULL(title, ''), ' ', IFNULL(text, '')) AS concatted_content,
183                        LEAST(1.0, GREATEST(0.0, time_relevance)) AS norm_time_relevance
184                    FROM merged
185                ),
186                evaluated AS (
187                    SELECT
188                        *,
189                        {$term_evaluation_sql}
190                    FROM concatted
191                ),
192                scored AS (
193                    SELECT 
194                        *, 
195                        (
196                            (1 - 1 / (1 + ({$term_quality_sql}))) * norm_time_relevance
197                        ) AS score
198                    FROM evaluated
199                )
200            SELECT *
201            FROM scored
202            WHERE score > 0
203            ORDER BY score DESC
204            ZZZZZZZZZZ;
205        $this->log()->debug("Search SQL: {$sql}");
206        $sql_results = $this->entityManager()->createNativeQuery($sql, $rsm)->getArrayResult();
207        return array_map(function ($row) use ($terms) {
208            return [
209                'link' => $row['link'],
210                'icon' => $row['icon'],
211                'date' => $row['date'],
212                'title' => $row['title'],
213                'text' => $this->searchUtils()->getCutout(strip_tags($row['text'] ?? ''), $terms) ?: null,
214                'score' => round($row['score'], 5),
215                'debug' => implode(' / ', [
216                    'Score: '.round($row['score'], 5),
217                    'Time relevance: '.round($row['time_relevance'], 5),
218                ]),
219            ];
220        }, $sql_results);
221    }
222
223    protected function sqlExecute(string $sql): void {
224        $this->entityManager()->createNativeQuery($sql, new ResultSetMapping())->execute();
225    }
226
227    /** @return array<string> */
228    public function getDateFormattings(?\DateTime $date): array {
229        if ($date === null) {
230            return [];
231        }
232        return [
233            $date->format('Y-m-d'),
234            $date->format('d.m.Y'),
235            $date->format('j.n.Y'),
236        ];
237    }
238
239    public function getDateSql(string $field, string $term): ?string {
240        $result = $this->dateUtils()->parseDateTimeRange($term);
241        if ($result === null) {
242            return null;
243        }
244        return <<<ZZZZZZZZZZ
245            (
246                {$field} >= '{$result['start']->format('Y-m-d')}'
247                AND {$field} < '{$result['end']->format('Y-m-d')}'
248            )
249            ZZZZZZZZZZ;
250    }
251
252    /** @param array<string> $search_terms */
253    public function getCutout(string $text, array $search_terms, int $size = 100): string {
254        $text = $this->censorEmails($text);
255        $offsets_by_term = $this->getOffsets($text, $search_terms);
256
257        $text_length = mb_strlen($text);
258        $term_lengths = [];
259        $term_scores = [];
260        foreach ($search_terms as $search_term) {
261            $term_length = mb_strlen($search_term);
262            $term_lengths[] = $term_length;
263            $term_scores[] = log($term_length) + 1;
264        }
265
266        $all_end_offsets = [];
267        for ($i = 0; $i < count($offsets_by_term); $i++) {
268            $term_length = $term_lengths[$i];
269            foreach ($offsets_by_term[$i] as $offset) {
270                $all_end_offsets[] = $offset + $term_length;
271            }
272        }
273        $all_end_offsets[] = $text_length;
274        sort($all_end_offsets);
275
276        $best_cutout_start = 0;
277        $best_cutout_end = 0;
278        $best_cutout_score = 0;
279        $start_idxs = array_map(fn () => -1, $search_terms);
280        $end_idxs = array_map(fn () => -1, $search_terms);
281        $after_all = $text_length + 1;
282        foreach ($all_end_offsets as $offset) {
283            $start = $offset;
284            $score = 1;
285            for ($i = 0; $i < count($search_terms); $i++) {
286                $term_length = $term_lengths[$i];
287                $offsets = $offsets_by_term[$i];
288                while (($offsets[$end_idxs[$i] + 1] ?? $after_all) + $term_length <= $offset) {
289                    $end_idxs[$i]++;
290                }
291                while (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $offset - $size) {
292                    $start_idxs[$i]++;
293                }
294                $num = $end_idxs[$i] - $start_idxs[$i];
295                $term_score = $term_scores[$i];
296                $score *= ($num * $term_score) + 1;
297                if (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $start) {
298                    $start = $offsets[$start_idxs[$i] + 1];
299                }
300            }
301            if ($score > $best_cutout_score) {
302                $best_cutout_score = $score;
303                $best_cutout_start = $start;
304                $best_cutout_end = $offset;
305            }
306        }
307        $best_cutout_length = $best_cutout_end - $best_cutout_start;
308        $margin_size = ($size - $best_cutout_length) / 2;
309        $offset = max(0, min($text_length - $size, $best_cutout_start - intval($margin_size)));
310        return implode('', [
311            $offset === 0 ? '' : '…',
312            trim(mb_substr($text, $offset, $size)),
313            ($offset + $size >= $text_length) ? '' : '…',
314        ]);
315    }
316
317    public function censorEmails(string $text): string {
318        return preg_replace('/([A-Z0-9a-z._%+-]+)@([A-Za-z0-9.-]+)/', '***@***', $text) ?? '';
319    }
320
321    /**
322     * @param array<string> $search_terms
323     *
324     * @return array<array<int>>
325     */
326    public function getOffsets(string $text, array $search_terms): array {
327        $text_length = mb_strlen($text);
328        $offsets_by_term = [];
329        foreach ($search_terms as $search_term) {
330            $term_length = mb_strlen($search_term);
331            $term_regex = preg_quote($search_term, '/');
332            $parts = preg_split("/({$term_regex})/ui", $text) ?: [];
333            $part_lengths = array_map(fn ($part) => mb_strlen($part), $parts);
334            unset($parts);
335            $offsets = [];
336            $offset = 0;
337            $sanity_check = false;
338            foreach ($part_lengths as $part_length) {
339                $offset += $part_length;
340                $sanity_check = $offset === $text_length;
341                if (!$sanity_check) { // Don't add the last offset; it's just the text length
342                    $offsets[] = $offset;
343                }
344                $offset += $term_length;
345            }
346            assert($sanity_check, 'Cutout offset sanity check failed');
347            $offsets_by_term[] = $offsets;
348        }
349        return $offsets_by_term;
350    }
351
352    /** @param array<string> $search_terms */
353    public function highlight(string $text, array $search_terms): string {
354        $offsets_by_term = $this->getOffsets($text, $search_terms);
355        $term_lengths = array_map(fn ($term) => mb_strlen($term), $search_terms);
356
357        $ranges = [];
358        for ($i = 0; $i < count($offsets_by_term); $i++) {
359            $term_length = $term_lengths[$i];
360            foreach ($offsets_by_term[$i] as $offset) {
361                $ranges[] = [$offset, $offset + $term_length];
362            }
363        }
364        $merged_ranges = $this->normalizeRanges($ranges);
365
366        $out = '';
367        $start_tag = '<span class="highlight">';
368        $end_tag = '</span>';
369        $last_end = 0;
370        foreach ($merged_ranges as $range) {
371            $out .= mb_substr($text, $last_end, $range[0] - $last_end);
372            $out .= $start_tag;
373            $out .= mb_substr($text, $range[0], $range[1] - $range[0]);
374            $out .= $end_tag;
375            $last_end = $range[1];
376        }
377        $out .= mb_substr($text, $last_end);
378        return $out;
379    }
380
381    /**
382     * @param array<array{0:int, 1:int}> $ranges
383     *
384     * @return array<array{0:int, 1:int}>
385     */
386    public function normalizeRanges(array $ranges): array {
387        usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]);
388        $normalized_ranges = [];
389        $num_ranges = count($ranges);
390        $i = 0;
391        while ($i < $num_ranges) {
392            $range = $ranges[$i];
393            $start = $range[0];
394            $end = $range[1];
395            while ($i + 1 < $num_ranges && $ranges[$i + 1][0] <= $end) { // merge next range
396                $end = max($end, $ranges[$i + 1][1]);
397                $i++;
398            }
399            $normalized_ranges[] = [$start, $end];
400            $i++;
401        }
402        return $normalized_ranges;
403    }
404}