Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
69.80% covered (warning)
69.80%
171 / 245
83.33% covered (warning)
83.33%
10 / 12
CRAP
0.00% covered (danger)
0.00%
0 / 1
SearchUtils
69.80% covered (warning)
69.80%
171 / 245
83.33% covered (warning)
83.33%
10 / 12
115.16
0.00% covered (danger)
0.00%
0 / 1
 getSearchResults
0.00% covered (danger)
0.00%
0 / 73
0.00% covered (danger)
0.00%
0 / 1
90
 sqlExecute
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getDateFormattings
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 getDateSql
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 getSnippetsWhereSql
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 getStaticResultQuery
100.00% covered (success)
100.00%
30 / 30
100.00% covered (success)
100.00%
1 / 1
2
 unionAllQueries
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
5
 getCutout
100.00% covered (success)
100.00%
49 / 49
100.00% covered (success)
100.00%
1 / 1
12
 censorEmails
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getOffsets
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 highlight
100.00% covered (success)
100.00%
20 / 20
100.00% covered (success)
100.00%
1 / 1
4
 normalizeRanges
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
1<?php
2
3namespace Olz\Suche\Utils;
4
5use Doctrine\ORM\Query\ResultSetMapping;
6use Olz\Anniversary\Components\OlzAnniversary\OlzAnniversary;
7use Olz\Apps\Anmelden\Components\OlzAnmelden\OlzAnmelden;
8use Olz\Apps\Commands\Components\OlzCommands\OlzCommands;
9use Olz\Apps\Files\Components\OlzFiles\OlzFiles;
10use Olz\Apps\Logs\Components\OlzLogs\OlzLogs;
11use Olz\Apps\Members\Components\OlzMembers\OlzMembers;
12use Olz\Apps\Monitoring\Components\OlzMonitoring\OlzMonitoring;
13use Olz\Apps\Newsletter\Components\OlzNewsletter\OlzNewsletter;
14use Olz\Apps\Oev\Components\OlzOev\OlzOev;
15use Olz\Apps\Panini2024\Components\OlzPanini2024\OlzPanini2024;
16use Olz\Apps\Panini2024\Components\OlzPanini2024All\OlzPanini2024All;
17use Olz\Apps\Panini2024\Components\OlzPanini2024Masks\OlzPanini2024Masks;
18use Olz\Apps\Quiz\Components\OlzQuiz\OlzQuiz;
19use Olz\Apps\Results\Components\OlzResults\OlzResults;
20use Olz\Apps\SearchEngines\Components\OlzSearchEngines\OlzSearchEngines;
21use Olz\Apps\Statistics\Components\OlzStatistics\OlzStatistics;
22use Olz\Apps\Youtube\Components\OlzYoutube\OlzYoutube;
23use Olz\Components\Auth\OlzEmailReaktion\OlzEmailReaktion;
24use Olz\Components\Common\OlzRootComponent;
25use Olz\Components\OlzHtmlSitemap\OlzHtmlSitemap;
26use Olz\Components\OtherPages\OlzAngebot\OlzAngebot;
27use Olz\Components\OtherPages\OlzDatenschutz\OlzDatenschutz;
28use Olz\Components\OtherPages\OlzFuerEinsteiger\OlzFuerEinsteiger;
29use Olz\Faq\Components\OlzFaqDetail\OlzFaqDetail;
30use Olz\Faq\Components\OlzFaqList\OlzFaqList;
31use Olz\Karten\Components\OlzKarteDetail\OlzKarteDetail;
32use Olz\Karten\Components\OlzKarten\OlzKarten;
33use Olz\News\Components\OlzNewsDetail\OlzNewsDetail;
34use Olz\News\Components\OlzNewsList\OlzNewsList;
35use Olz\Repository\Snippets\PredefinedSnippet;
36use Olz\Roles\Components\OlzRoleDetail\OlzRoleDetail;
37use Olz\Roles\Components\OlzVerein\OlzVerein;
38use Olz\Service\Components\OlzService\OlzService;
39use Olz\Startseite\Components\OlzStartseite\OlzStartseite;
40use Olz\Suche\Components\OlzSuche\OlzSuche;
41use Olz\Termine\Components\OlzTerminDetail\OlzTerminDetail;
42use Olz\Termine\Components\OlzTermineList\OlzTermineList;
43use Olz\Termine\Components\OlzTerminLocationDetail\OlzTerminLocationDetail;
44use Olz\Termine\Components\OlzTerminLocationsList\OlzTerminLocationsList;
45use Olz\Termine\Components\OlzTerminTemplateDetail\OlzTerminTemplateDetail;
46use Olz\Termine\Components\OlzTerminTemplatesList\OlzTerminTemplatesList;
47use Olz\Users\Components\OlzUserDetail\OlzUserDetail;
48use Olz\Utils\WithUtilsTrait;
49
50/**
51 * @phpstan-type SearchResult array{
52 *   link: non-empty-string,
53 *   icon: ?non-empty-string,
54 *   date: ?\DateTime,
55 *   title: non-empty-string,
56 *   text: ?non-empty-string,
57 *   score: float,
58 *   debug: string,
59 * }
60 * @phpstan-type StaticResult array{
61 *   link: string,
62 *   icon?: ?string,
63 *   date?: ?\DateTime,
64 *   title: string,
65 *   text?: ?string,
66 *   timeRelevance?: ?float,
67 * }
68 * @phpstan-type WithQuery array{
69 *   with: array<string>,
70 *   query: string,
71 * }
72 */
73class SearchUtils {
74    use WithUtilsTrait;
75
76    /** @var array<class-string<OlzRootComponent<mixed>>> */
77    protected static array $all_page_classes = [
78        // All classes that extend `OlzRootComponent` should be listed here:
79        OlzAnmelden::class,
80        OlzAnniversary::class,
81        OlzCommands::class,
82        OlzFiles::class,
83        OlzLogs::class,
84        OlzMembers::class,
85        OlzMonitoring::class,
86        OlzNewsletter::class,
87        OlzOev::class,
88        OlzPanini2024::class,
89        OlzPanini2024All::class,
90        OlzPanini2024Masks::class,
91        OlzQuiz::class,
92        OlzResults::class,
93        OlzSearchEngines::class,
94        OlzStatistics::class,
95        OlzYoutube::class,
96        OlzEmailReaktion::class,
97        OlzHtmlSitemap::class,
98        OlzDatenschutz::class,
99        OlzFuerEinsteiger::class,
100        OlzAngebot::class,
101        OlzFaqDetail::class,
102        OlzFaqList::class,
103        OlzKarteDetail::class,
104        OlzKarten::class,
105        OlzNewsDetail::class,
106        OlzNewsList::class,
107        OlzRoleDetail::class,
108        OlzVerein::class,
109        OlzService::class,
110        OlzStartseite::class,
111        OlzSuche::class,
112        OlzTerminDetail::class,
113        OlzTermineList::class,
114        OlzTerminLocationDetail::class,
115        OlzTerminLocationsList::class,
116        OlzTerminTemplateDetail::class,
117        OlzTerminTemplatesList::class,
118        OlzUserDetail::class,
119    ];
120
121    /**
122     * @param array<string> $terms
123     *
124     * @return array<SearchResult>
125     */
126    public function getSearchResults(array $terms): array {
127        $db = $this->dbUtils()->getDb();
128        $num_terms = count($terms);
129        $esc_terms = array_map(fn ($term) => $db->real_escape_string($term), $terms);
130        $page_queries = [];
131        $page_withs = [];
132        foreach (self::$all_page_classes as $page_class) {
133            $page = new $page_class();
134            $result = $page->searchSql($esc_terms);
135            if (is_array($result)) {
136                $page_queries[] = $result['query'];
137                foreach ($result['with'] as $with) {
138                    $page_withs[] = $with;
139                }
140            } elseif (is_string($result)) {
141                $page_queries[] = $result;
142            } // else ignore
143        }
144        $page_withs_sql = implode(',', $page_withs);
145        $merged_sql = implode(' UNION ALL ', $page_queries);
146
147        $idx = 0;
148        $term_evaluation_sqls = [];
149        $term_evaluation_columns = [];
150        foreach ($terms as $term) {
151            $esc_term = $db->real_escape_string(preg_quote($term));
152            $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_term}') AS term{$idx}_any";
153            $term_evaluation_columns[] = "term{$idx}_any";
154            // Add preference for word matches
155            $term_evaluation_sqls[] = "get_quality(concatted_content, '(?=\\\\W|^){$esc_term}') AS term{$idx}_prefix";
156            $term_evaluation_columns[] = "term{$idx}_prefix";
157            $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_term}(?=\\\\W|$)') AS term{$idx}_suffix";
158            $term_evaluation_columns[] = "term{$idx}_suffix";
159            $idx++;
160        }
161        // Add preference to term combination matches
162        for ($num_combined = 2; $num_combined <= min(3, $num_terms); $num_combined++) {
163            for ($start_combined = 0; $start_combined <= $num_terms - $num_combined; $start_combined++) {
164                $combined_terms = array_slice($terms, $start_combined, $num_combined);
165                $esc_combined_terms = $db->real_escape_string(implode('(\W{0,5}|\s*)', array_map(
166                    fn ($term) => preg_quote($term),
167                    $combined_terms
168                )));
169                $term_evaluation_sqls[] = "get_quality(concatted_content, '{$esc_combined_terms}') AS terms_{$start_combined}_{$num_combined}";
170                $term_evaluation_columns[] = "terms_{$start_combined}_{$num_combined}";
171                // TODO: Combined date formattings?
172            }
173        }
174        $term_evaluation_sql = implode(',', $term_evaluation_sqls);
175        $term_quality_sql = implode('+', $term_evaluation_columns);
176        $rsm = new ResultSetMapping();
177        $rsm->addScalarResult('link', 'link', 'string');
178        $rsm->addScalarResult('icon', 'icon', 'string');
179        $rsm->addScalarResult('date', 'date', 'date');
180        $rsm->addScalarResult('title', 'title', 'string');
181        $rsm->addScalarResult('text', 'text', 'string');
182        $rsm->addScalarResult('score', 'score', 'string');
183        $rsm->addScalarResult('time_relevance', 'time_relevance', 'string');
184        $this->sqlExecute('DROP FUNCTION IF EXISTS get_quality');
185        $this->sqlExecute(<<<'ZZZZZZZZZZ'
186            CREATE FUNCTION get_quality (content TEXT, regex TEXT)
187                RETURNS FLOAT DETERMINISTIC
188                RETURN (LENGTH(content) - LENGTH(REGEXP_REPLACE(content, regex, ''))) / SQRT(LENGTH(content))
189            ZZZZZZZZZZ);
190        $sql = <<<ZZZZZZZZZZ
191            WITH
192                {$page_withs_sql},
193                merged AS ({$merged_sql}),
194                concatted AS (
195                    SELECT
196                        *,
197                        CONCAT(IFNULL(title, ''), ' ', IFNULL(title, ''), ' ', IFNULL(title, ''), ' ', IFNULL(text, '')) AS concatted_content,
198                        LEAST(1.0, GREATEST(0.0, time_relevance)) AS norm_time_relevance
199                    FROM merged
200                ),
201                evaluated AS (
202                    SELECT
203                        *,
204                        {$term_evaluation_sql}
205                    FROM concatted
206                ),
207                scored AS (
208                    SELECT 
209                        *, 
210                        (
211                            (1 - 1 / (1 + ({$term_quality_sql}))) * norm_time_relevance
212                        ) AS score
213                    FROM evaluated
214                )
215            SELECT *
216            FROM scored
217            WHERE score > 0
218            ORDER BY score DESC
219            ZZZZZZZZZZ;
220        $this->log()->debug("Search SQL: {$sql}");
221        $sql_results = $this->entityManager()->createNativeQuery($sql, $rsm)->getArrayResult();
222        return array_map(function ($row) use ($terms) {
223            return [
224                'link' => $row['link'],
225                'icon' => $row['icon'],
226                'date' => $row['date'],
227                'title' => $row['title'],
228                'text' => $this->searchUtils()->getCutout(strip_tags($row['text'] ?? ''), $terms) ?: null,
229                'score' => round($row['score'], 5),
230                'debug' => implode(' / ', [
231                    'Score: '.round($row['score'], 5),
232                    'Time relevance: '.round($row['time_relevance'], 5),
233                ]),
234            ];
235        }, $sql_results);
236    }
237
238    protected function sqlExecute(string $sql): void {
239        $this->entityManager()->createNativeQuery($sql, new ResultSetMapping())->execute();
240    }
241
242    /** @return array<string> */
243    public function getDateFormattings(?\DateTime $date): array {
244        if ($date === null) {
245            return [];
246        }
247        return [
248            $date->format('Y-m-d'),
249            $date->format('d.m.Y'),
250            $date->format('j.n.Y'),
251        ];
252    }
253
254    public function getDateSql(string $field, string $term): ?string {
255        $result = $this->dateUtils()->parseDateTimeRange($term);
256        if ($result === null) {
257            return null;
258        }
259        return <<<ZZZZZZZZZZ
260            (
261                {$field} >= '{$result['start']->format('Y-m-d')}'
262                AND {$field} < '{$result['end']->format('Y-m-d')}'
263            )
264            ZZZZZZZZZZ;
265    }
266
267    /**
268     * @param array<PredefinedSnippet|int<1,max>> $snippets
269     * @param array<string>                       $terms
270     */
271    public function getSnippetsWhereSql(array $snippets, array $terms): ?string {
272        $terms_where = implode(' AND ', array_map(fn ($term) => "text LIKE '%{$term}%'", $terms));
273        $ids = implode(',', array_map(
274            fn ($snippet) => is_int($snippet) ? "'{$snippet}'" : "'{$snippet->value}'",
275            $snippets,
276        ));
277        return <<<ZZZZZZZZZZ
278            on_off = '1'
279            AND {$terms_where}
280            AND id IN ({$ids})
281            ZZZZZZZZZZ;
282    }
283
284    /**
285     * @param StaticResult  $static_result
286     * @param array<string> $terms
287     *
288     * @return WithQuery
289     */
290    public function getStaticResultQuery(array $static_result, array $terms): array {
291        $ident = md5(json_encode($static_result) ?: '');
292        $esc_link = $this->generalUtils()->internalSqlEscape($static_result['link']);
293        $esc_icon = $this->generalUtils()->internalNullableSqlEscape($static_result['icon'] ?? null);
294        // TODO: date
295        $esc_title = $this->generalUtils()->internalSqlEscape($static_result['title']);
296        $esc_text = $this->generalUtils()->internalNullableSqlEscape($static_result['text'] ?? null);
297        $esc_time_relevance = floatval($static_result['timeRelevance'] ?? '1.0');
298        $static_where = implode(' AND ', array_map(function ($term) {
299            return <<<ZZZZZZZZZZ
300                (
301                    title LIKE '%{$term}%'
302                    OR text LIKE '%{$term}%'
303                )
304                ZZZZZZZZZZ;
305        }, $terms));
306        return [
307            'with' => [
308                <<<ZZZZZZZZZZ
309                    static_{$ident} AS (
310                        SELECT
311                            '{$esc_link}' AS link,
312                            {$esc_icon} AS icon,
313                            NULL AS date,
314                            '{$esc_title}' AS title,
315                            {$esc_text} AS text
316                    )
317                    ZZZZZZZZZZ,
318            ],
319            'query' => <<<ZZZZZZZZZZ
320                SELECT
321                    link, icon, date, title, text,
322                    {$esc_time_relevance} AS time_relevance
323                FROM static_{$ident}
324                WHERE {$static_where}
325                ZZZZZZZZZZ,
326        ];
327    }
328
329    /**
330     * @param array<WithQuery|string> $queries
331     *
332     * @return WithQuery
333     */
334    public function unionAllQueries(array $queries): array {
335        $out_withs = [];
336        $out_queries = [];
337        foreach ($queries as $query) {
338            if (is_array($query)) {
339                $out_queries[] = $query['query'];
340                foreach ($query['with'] as $with) {
341                    $out_withs[] = $with;
342                }
343            } elseif (is_string($query)) {
344                $out_queries[] = $query;
345            } // else ignore
346        }
347        return [
348            'with' => $out_withs,
349            'query' => implode(' UNION ALL ', $out_queries),
350        ];
351    }
352
353    /** @param array<string> $search_terms */
354    public function getCutout(string $text, array $search_terms, int $size = 100): string {
355        $text = $this->censorEmails($text);
356        $offsets_by_term = $this->getOffsets($text, $search_terms);
357
358        $text_length = mb_strlen($text);
359        $term_lengths = [];
360        $term_scores = [];
361        foreach ($search_terms as $search_term) {
362            $term_length = mb_strlen($search_term);
363            $term_lengths[] = $term_length;
364            $term_scores[] = log($term_length) + 1;
365        }
366
367        $all_end_offsets = [];
368        for ($i = 0; $i < count($offsets_by_term); $i++) {
369            $term_length = $term_lengths[$i];
370            foreach ($offsets_by_term[$i] as $offset) {
371                $all_end_offsets[] = $offset + $term_length;
372            }
373        }
374        $all_end_offsets[] = $text_length;
375        sort($all_end_offsets);
376
377        $best_cutout_start = 0;
378        $best_cutout_end = 0;
379        $best_cutout_score = 0;
380        $start_idxs = array_map(fn () => -1, $search_terms);
381        $end_idxs = array_map(fn () => -1, $search_terms);
382        $after_all = $text_length + 1;
383        foreach ($all_end_offsets as $offset) {
384            $start = $offset;
385            $score = 1;
386            for ($i = 0; $i < count($search_terms); $i++) {
387                $term_length = $term_lengths[$i];
388                $offsets = $offsets_by_term[$i];
389                while (($offsets[$end_idxs[$i] + 1] ?? $after_all) + $term_length <= $offset) {
390                    $end_idxs[$i]++;
391                }
392                while (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $offset - $size) {
393                    $start_idxs[$i]++;
394                }
395                $num = $end_idxs[$i] - $start_idxs[$i];
396                $term_score = $term_scores[$i];
397                $score *= ($num * $term_score) + 1;
398                if (($offsets[$start_idxs[$i] + 1] ?? $after_all) < $start) {
399                    $start = $offsets[$start_idxs[$i] + 1];
400                }
401            }
402            if ($score > $best_cutout_score) {
403                $best_cutout_score = $score;
404                $best_cutout_start = $start;
405                $best_cutout_end = $offset;
406            }
407        }
408        $best_cutout_length = $best_cutout_end - $best_cutout_start;
409        $margin_size = ($size - $best_cutout_length) / 2;
410        $offset = max(0, min($text_length - $size, $best_cutout_start - intval($margin_size)));
411        return implode('', [
412            $offset === 0 ? '' : '…',
413            trim(mb_substr($text, $offset, $size)),
414            ($offset + $size >= $text_length) ? '' : '…',
415        ]);
416    }
417
418    public function censorEmails(string $text): string {
419        return preg_replace('/([A-Z0-9a-z._%+-]+)@([A-Za-z0-9.-]+)/', '***@***', $text) ?? '';
420    }
421
422    /**
423     * @param array<string> $search_terms
424     *
425     * @return array<array<int>>
426     */
427    public function getOffsets(string $text, array $search_terms): array {
428        $text_length = mb_strlen($text);
429        $offsets_by_term = [];
430        foreach ($search_terms as $search_term) {
431            $term_length = mb_strlen($search_term);
432            $term_regex = preg_quote($search_term, '/');
433            $parts = preg_split("/({$term_regex})/ui", $text) ?: [];
434            $part_lengths = array_map(fn ($part) => mb_strlen($part), $parts);
435            unset($parts);
436            $offsets = [];
437            $offset = 0;
438            $sanity_check = false;
439            foreach ($part_lengths as $part_length) {
440                $offset += $part_length;
441                $sanity_check = $offset === $text_length;
442                if (!$sanity_check) { // Don't add the last offset; it's just the text length
443                    $offsets[] = $offset;
444                }
445                $offset += $term_length;
446            }
447            assert($sanity_check, 'Cutout offset sanity check failed');
448            $offsets_by_term[] = $offsets;
449        }
450        return $offsets_by_term;
451    }
452
453    /** @param array<string> $search_terms */
454    public function highlight(string $text, array $search_terms): string {
455        $offsets_by_term = $this->getOffsets($text, $search_terms);
456        $term_lengths = array_map(fn ($term) => mb_strlen($term), $search_terms);
457
458        $ranges = [];
459        for ($i = 0; $i < count($offsets_by_term); $i++) {
460            $term_length = $term_lengths[$i];
461            foreach ($offsets_by_term[$i] as $offset) {
462                $ranges[] = [$offset, $offset + $term_length];
463            }
464        }
465        $merged_ranges = $this->normalizeRanges($ranges);
466
467        $out = '';
468        $start_tag = '<span class="highlight">';
469        $end_tag = '</span>';
470        $last_end = 0;
471        foreach ($merged_ranges as $range) {
472            $out .= mb_substr($text, $last_end, $range[0] - $last_end);
473            $out .= $start_tag;
474            $out .= mb_substr($text, $range[0], $range[1] - $range[0]);
475            $out .= $end_tag;
476            $last_end = $range[1];
477        }
478        $out .= mb_substr($text, $last_end);
479        return $out;
480    }
481
482    /**
483     * @param array<array{0:int, 1:int}> $ranges
484     *
485     * @return array<array{0:int, 1:int}>
486     */
487    public function normalizeRanges(array $ranges): array {
488        usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]);
489        $normalized_ranges = [];
490        $num_ranges = count($ranges);
491        $i = 0;
492        while ($i < $num_ranges) {
493            $range = $ranges[$i];
494            $start = $range[0];
495            $end = $range[1];
496            while ($i + 1 < $num_ranges && $ranges[$i + 1][0] <= $end) { // merge next range
497                $end = max($end, $ranges[$i + 1][1]);
498                $i++;
499            }
500            $normalized_ranges[] = [$start, $end];
501            $i++;
502        }
503        return $normalized_ranges;
504    }
505}