regex_syntax/hir/
translate.rs

1/*!
2Defines a translator that converts an `Ast` to an `Hir`.
3*/
4
5use core::cell::{Cell, RefCell};
6
7use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
8
9use crate::{
10    ast::{self, Ast, Span, Visitor},
11    either::Either,
12    hir::{self, Error, ErrorKind, Hir, HirKind},
13    unicode::{self, ClassQuery},
14};
15
16type Result<T> = core::result::Result<T, Error>;
17
18/// A builder for constructing an AST->HIR translator.
19#[derive(Clone, Debug)]
20pub struct TranslatorBuilder {
21    utf8: bool,
22    line_terminator: u8,
23    flags: Flags,
24}
25
26impl Default for TranslatorBuilder {
27    fn default() -> TranslatorBuilder {
28        TranslatorBuilder::new()
29    }
30}
31
32impl TranslatorBuilder {
33    /// Create a new translator builder with a default configuration.
34    pub fn new() -> TranslatorBuilder {
35        TranslatorBuilder {
36            utf8: true,
37            line_terminator: b'\n',
38            flags: Flags::default(),
39        }
40    }
41
42    /// Build a translator using the current configuration.
43    pub fn build(&self) -> Translator {
44        Translator {
45            stack: RefCell::new(vec![]),
46            flags: Cell::new(self.flags),
47            utf8: self.utf8,
48            line_terminator: self.line_terminator,
49        }
50    }
51
52    /// When disabled, translation will permit the construction of a regular
53    /// expression that may match invalid UTF-8.
54    ///
55    /// When enabled (the default), the translator is guaranteed to produce an
56    /// expression that, for non-empty matches, will only ever produce spans
57    /// that are entirely valid UTF-8 (otherwise, the translator will return an
58    /// error).
59    ///
60    /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
61    /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
62    /// syntax) will be allowed even though they can produce matches that split
63    /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
64    /// matches, and it is expected that the regex engine itself must handle
65    /// these cases if necessary (perhaps by suppressing any zero-width matches
66    /// that split a codepoint).
67    pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
68        self.utf8 = yes;
69        self
70    }
71
72    /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
73    ///
74    /// Namely, instead of `.` (by default) matching everything except for `\n`,
75    /// this will cause `.` to match everything except for the byte given.
76    ///
77    /// If `.` is used in a context where Unicode mode is enabled and this byte
78    /// isn't ASCII, then an error will be returned. When Unicode mode is
79    /// disabled, then any byte is permitted, but will return an error if UTF-8
80    /// mode is enabled and it is a non-ASCII byte.
81    ///
82    /// In short, any ASCII value for a line terminator is always okay. But a
83    /// non-ASCII byte might result in an error depending on whether Unicode
84    /// mode or UTF-8 mode are enabled.
85    ///
86    /// Note that if `R` mode is enabled then it always takes precedence and
87    /// the line terminator will be treated as `\r` and `\n` simultaneously.
88    ///
89    /// Note also that this *doesn't* impact the look-around assertions
90    /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
91    /// configuration in the regex engine itself.
92    pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
93        self.line_terminator = byte;
94        self
95    }
96
97    /// Enable or disable the case insensitive flag (`i`) by default.
98    pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
99        self.flags.case_insensitive = if yes { Some(true) } else { None };
100        self
101    }
102
103    /// Enable or disable the multi-line matching flag (`m`) by default.
104    pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
105        self.flags.multi_line = if yes { Some(true) } else { None };
106        self
107    }
108
109    /// Enable or disable the "dot matches any character" flag (`s`) by
110    /// default.
111    pub fn dot_matches_new_line(
112        &mut self,
113        yes: bool,
114    ) -> &mut TranslatorBuilder {
115        self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
116        self
117    }
118
119    /// Enable or disable the CRLF mode flag (`R`) by default.
120    pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
121        self.flags.crlf = if yes { Some(true) } else { None };
122        self
123    }
124
125    /// Enable or disable the "swap greed" flag (`U`) by default.
126    pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
127        self.flags.swap_greed = if yes { Some(true) } else { None };
128        self
129    }
130
131    /// Enable or disable the Unicode flag (`u`) by default.
132    pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
133        self.flags.unicode = if yes { None } else { Some(false) };
134        self
135    }
136}
137
138/// A translator maps abstract syntax to a high level intermediate
139/// representation.
140///
141/// A translator may be benefit from reuse. That is, a translator can translate
142/// many abstract syntax trees.
143///
144/// A `Translator` can be configured in more detail via a
145/// [`TranslatorBuilder`].
146#[derive(Clone, Debug)]
147pub struct Translator {
148    /// Our call stack, but on the heap.
149    stack: RefCell<Vec<HirFrame>>,
150    /// The current flag settings.
151    flags: Cell<Flags>,
152    /// Whether we're allowed to produce HIR that can match arbitrary bytes.
153    utf8: bool,
154    /// The line terminator to use for `.`.
155    line_terminator: u8,
156}
157
158impl Translator {
159    /// Create a new translator using the default configuration.
160    pub fn new() -> Translator {
161        TranslatorBuilder::new().build()
162    }
163
164    /// Translate the given abstract syntax tree (AST) into a high level
165    /// intermediate representation (HIR).
166    ///
167    /// If there was a problem doing the translation, then an HIR-specific
168    /// error is returned.
169    ///
170    /// The original pattern string used to produce the `Ast` *must* also be
171    /// provided. The translator does not use the pattern string during any
172    /// correct translation, but is used for error reporting.
173    pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
174        ast::visit(ast, TranslatorI::new(self, pattern))
175    }
176}
177
178/// An HirFrame is a single stack frame, represented explicitly, which is
179/// created for each item in the Ast that we traverse.
180///
181/// Note that technically, this type doesn't represent our entire stack
182/// frame. In particular, the Ast visitor represents any state associated with
183/// traversing the Ast itself.
184#[derive(Clone, Debug)]
185enum HirFrame {
186    /// An arbitrary HIR expression. These get pushed whenever we hit a base
187    /// case in the Ast. They get popped after an inductive (i.e., recursive)
188    /// step is complete.
189    Expr(Hir),
190    /// A literal that is being constructed, character by character, from the
191    /// AST. We need this because the AST gives each individual character its
192    /// own node. So as we see characters, we peek at the top-most HirFrame.
193    /// If it's a literal, then we add to it. Otherwise, we push a new literal.
194    /// When it comes time to pop it, we convert it to an Hir via Hir::literal.
195    Literal(Vec<u8>),
196    /// A Unicode character class. This frame is mutated as we descend into
197    /// the Ast of a character class (which is itself its own mini recursive
198    /// structure).
199    ClassUnicode(hir::ClassUnicode),
200    /// A byte-oriented character class. This frame is mutated as we descend
201    /// into the Ast of a character class (which is itself its own mini
202    /// recursive structure).
203    ///
204    /// Byte character classes are created when Unicode mode (`u`) is disabled.
205    /// If `utf8` is enabled (the default), then a byte character is only
206    /// permitted to match ASCII text.
207    ClassBytes(hir::ClassBytes),
208    /// This is pushed whenever a repetition is observed. After visiting every
209    /// sub-expression in the repetition, the translator's stack is expected to
210    /// have this sentinel at the top.
211    ///
212    /// This sentinel only exists to stop other things (like flattening
213    /// literals) from reaching across repetition operators.
214    Repetition,
215    /// This is pushed on to the stack upon first seeing any kind of capture,
216    /// indicated by parentheses (including non-capturing groups). It is popped
217    /// upon leaving a group.
218    Group {
219        /// The old active flags when this group was opened.
220        ///
221        /// If this group sets flags, then the new active flags are set to the
222        /// result of merging the old flags with the flags introduced by this
223        /// group. If the group doesn't set any flags, then this is simply
224        /// equivalent to whatever flags were set when the group was opened.
225        ///
226        /// When this group is popped, the active flags should be restored to
227        /// the flags set here.
228        ///
229        /// The "active" flags correspond to whatever flags are set in the
230        /// Translator.
231        old_flags: Flags,
232    },
233    /// This is pushed whenever a concatenation is observed. After visiting
234    /// every sub-expression in the concatenation, the translator's stack is
235    /// popped until it sees a Concat frame.
236    Concat,
237    /// This is pushed whenever an alternation is observed. After visiting
238    /// every sub-expression in the alternation, the translator's stack is
239    /// popped until it sees an Alternation frame.
240    Alternation,
241    /// This is pushed immediately before each sub-expression in an
242    /// alternation. This separates the branches of an alternation on the
243    /// stack and prevents literal flattening from reaching across alternation
244    /// branches.
245    ///
246    /// It is popped after each expression in a branch until an 'Alternation'
247    /// frame is observed when doing a post visit on an alternation.
248    AlternationBranch,
249}
250
251impl HirFrame {
252    /// Assert that the current stack frame is an Hir expression and return it.
253    fn unwrap_expr(self) -> Hir {
254        match self {
255            HirFrame::Expr(expr) => expr,
256            HirFrame::Literal(lit) => Hir::literal(lit),
257            _ => panic!("tried to unwrap expr from HirFrame, got: {self:?}"),
258        }
259    }
260
261    /// Assert that the current stack frame is a Unicode class expression and
262    /// return it.
263    fn unwrap_class_unicode(self) -> hir::ClassUnicode {
264        match self {
265            HirFrame::ClassUnicode(cls) => cls,
266            _ => panic!(
267                "tried to unwrap Unicode class \
268                 from HirFrame, got: {:?}",
269                self
270            ),
271        }
272    }
273
274    /// Assert that the current stack frame is a byte class expression and
275    /// return it.
276    fn unwrap_class_bytes(self) -> hir::ClassBytes {
277        match self {
278            HirFrame::ClassBytes(cls) => cls,
279            _ => panic!(
280                "tried to unwrap byte class \
281                 from HirFrame, got: {:?}",
282                self
283            ),
284        }
285    }
286
287    /// Assert that the current stack frame is a repetition sentinel. If it
288    /// isn't, then panic.
289    fn unwrap_repetition(self) {
290        match self {
291            HirFrame::Repetition => {}
292            _ => {
293                panic!(
294                    "tried to unwrap repetition from HirFrame, got: {self:?}"
295                )
296            }
297        }
298    }
299
300    /// Assert that the current stack frame is a group indicator and return
301    /// its corresponding flags (the flags that were active at the time the
302    /// group was entered).
303    fn unwrap_group(self) -> Flags {
304        match self {
305            HirFrame::Group { old_flags } => old_flags,
306            _ => {
307                panic!("tried to unwrap group from HirFrame, got: {self:?}")
308            }
309        }
310    }
311
312    /// Assert that the current stack frame is an alternation pipe sentinel. If
313    /// it isn't, then panic.
314    fn unwrap_alternation_pipe(self) {
315        match self {
316            HirFrame::AlternationBranch => {}
317            _ => {
318                panic!("tried to unwrap alt pipe from HirFrame, got: {self:?}")
319            }
320        }
321    }
322}
323
324impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
325    type Output = Hir;
326    type Err = Error;
327
328    fn finish(self) -> Result<Hir> {
329        // ... otherwise, we should have exactly one HIR on the stack.
330        assert_eq!(self.trans().stack.borrow().len(), 1);
331        Ok(self.pop().unwrap().unwrap_expr())
332    }
333
334    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
335        match *ast {
336            Ast::ClassBracketed(_) => {
337                if self.flags().unicode() {
338                    let cls = hir::ClassUnicode::empty();
339                    self.push(HirFrame::ClassUnicode(cls));
340                } else {
341                    let cls = hir::ClassBytes::empty();
342                    self.push(HirFrame::ClassBytes(cls));
343                }
344            }
345            Ast::Repetition(_) => self.push(HirFrame::Repetition),
346            Ast::Group(ref x) => {
347                let old_flags = x
348                    .flags()
349                    .map(|ast| self.set_flags(ast))
350                    .unwrap_or_else(|| self.flags());
351                self.push(HirFrame::Group { old_flags });
352            }
353            Ast::Concat(_) => {
354                self.push(HirFrame::Concat);
355            }
356            Ast::Alternation(ref x) => {
357                self.push(HirFrame::Alternation);
358                if !x.asts.is_empty() {
359                    self.push(HirFrame::AlternationBranch);
360                }
361            }
362            _ => {}
363        }
364        Ok(())
365    }
366
367    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
368        match *ast {
369            Ast::Empty(_) => {
370                self.push(HirFrame::Expr(Hir::empty()));
371            }
372            Ast::Flags(ref x) => {
373                self.set_flags(&x.flags);
374                // Flags in the AST are generally considered directives and
375                // not actual sub-expressions. However, they can be used in
376                // the concrete syntax like `((?i))`, and we need some kind of
377                // indication of an expression there, and Empty is the correct
378                // choice.
379                //
380                // There can also be things like `(?i)+`, but we rule those out
381                // in the parser. In the future, we might allow them for
382                // consistency sake.
383                self.push(HirFrame::Expr(Hir::empty()));
384            }
385            Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
386                Either::Right(byte) => self.push_byte(byte),
387                Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
388                    None => self.push_char(ch),
389                    Some(expr) => self.push(HirFrame::Expr(expr)),
390                },
391            },
392            Ast::Dot(ref span) => {
393                self.push(HirFrame::Expr(self.hir_dot(**span)?));
394            }
395            Ast::Assertion(ref x) => {
396                self.push(HirFrame::Expr(self.hir_assertion(x)?));
397            }
398            Ast::ClassPerl(ref x) => {
399                if self.flags().unicode() {
400                    let cls = self.hir_perl_unicode_class(x)?;
401                    let hcls = hir::Class::Unicode(cls);
402                    self.push(HirFrame::Expr(Hir::class(hcls)));
403                } else {
404                    let cls = self.hir_perl_byte_class(x)?;
405                    let hcls = hir::Class::Bytes(cls);
406                    self.push(HirFrame::Expr(Hir::class(hcls)));
407                }
408            }
409            Ast::ClassUnicode(ref x) => {
410                let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
411                self.push(HirFrame::Expr(Hir::class(cls)));
412            }
413            Ast::ClassBracketed(ref ast) => {
414                if self.flags().unicode() {
415                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
416                    self.unicode_fold_and_negate(
417                        &ast.span,
418                        ast.negated,
419                        &mut cls,
420                    )?;
421                    let expr = Hir::class(hir::Class::Unicode(cls));
422                    self.push(HirFrame::Expr(expr));
423                } else {
424                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
425                    self.bytes_fold_and_negate(
426                        &ast.span,
427                        ast.negated,
428                        &mut cls,
429                    )?;
430                    let expr = Hir::class(hir::Class::Bytes(cls));
431                    self.push(HirFrame::Expr(expr));
432                }
433            }
434            Ast::Repetition(ref x) => {
435                let expr = self.pop().unwrap().unwrap_expr();
436                self.pop().unwrap().unwrap_repetition();
437                self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
438            }
439            Ast::Group(ref x) => {
440                let expr = self.pop().unwrap().unwrap_expr();
441                let old_flags = self.pop().unwrap().unwrap_group();
442                self.trans().flags.set(old_flags);
443                self.push(HirFrame::Expr(self.hir_capture(x, expr)));
444            }
445            Ast::Concat(_) => {
446                let mut exprs = vec![];
447                while let Some(expr) = self.pop_concat_expr() {
448                    if !matches!(*expr.kind(), HirKind::Empty) {
449                        exprs.push(expr);
450                    }
451                }
452                exprs.reverse();
453                self.push(HirFrame::Expr(Hir::concat(exprs)));
454            }
455            Ast::Alternation(_) => {
456                let mut exprs = vec![];
457                while let Some(expr) = self.pop_alt_expr() {
458                    self.pop().unwrap().unwrap_alternation_pipe();
459                    exprs.push(expr);
460                }
461                exprs.reverse();
462                self.push(HirFrame::Expr(Hir::alternation(exprs)));
463            }
464        }
465        Ok(())
466    }
467
468    fn visit_alternation_in(&mut self) -> Result<()> {
469        self.push(HirFrame::AlternationBranch);
470        Ok(())
471    }
472
473    fn visit_class_set_item_pre(
474        &mut self,
475        ast: &ast::ClassSetItem,
476    ) -> Result<()> {
477        match *ast {
478            ast::ClassSetItem::Bracketed(_) => {
479                if self.flags().unicode() {
480                    let cls = hir::ClassUnicode::empty();
481                    self.push(HirFrame::ClassUnicode(cls));
482                } else {
483                    let cls = hir::ClassBytes::empty();
484                    self.push(HirFrame::ClassBytes(cls));
485                }
486            }
487            // We needn't handle the Union case here since the visitor will
488            // do it for us.
489            _ => {}
490        }
491        Ok(())
492    }
493
494    fn visit_class_set_item_post(
495        &mut self,
496        ast: &ast::ClassSetItem,
497    ) -> Result<()> {
498        match *ast {
499            ast::ClassSetItem::Empty(_) => {}
500            ast::ClassSetItem::Literal(ref x) => {
501                if self.flags().unicode() {
502                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
503                    cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
504                    self.push(HirFrame::ClassUnicode(cls));
505                } else {
506                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
507                    let byte = self.class_literal_byte(x)?;
508                    cls.push(hir::ClassBytesRange::new(byte, byte));
509                    self.push(HirFrame::ClassBytes(cls));
510                }
511            }
512            ast::ClassSetItem::Range(ref x) => {
513                if self.flags().unicode() {
514                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
515                    cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
516                    self.push(HirFrame::ClassUnicode(cls));
517                } else {
518                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
519                    let start = self.class_literal_byte(&x.start)?;
520                    let end = self.class_literal_byte(&x.end)?;
521                    cls.push(hir::ClassBytesRange::new(start, end));
522                    self.push(HirFrame::ClassBytes(cls));
523                }
524            }
525            ast::ClassSetItem::Ascii(ref x) => {
526                if self.flags().unicode() {
527                    let xcls = self.hir_ascii_unicode_class(x)?;
528                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
529                    cls.union(&xcls);
530                    self.push(HirFrame::ClassUnicode(cls));
531                } else {
532                    let xcls = self.hir_ascii_byte_class(x)?;
533                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
534                    cls.union(&xcls);
535                    self.push(HirFrame::ClassBytes(cls));
536                }
537            }
538            ast::ClassSetItem::Unicode(ref x) => {
539                let xcls = self.hir_unicode_class(x)?;
540                let mut cls = self.pop().unwrap().unwrap_class_unicode();
541                cls.union(&xcls);
542                self.push(HirFrame::ClassUnicode(cls));
543            }
544            ast::ClassSetItem::Perl(ref x) => {
545                if self.flags().unicode() {
546                    let xcls = self.hir_perl_unicode_class(x)?;
547                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
548                    cls.union(&xcls);
549                    self.push(HirFrame::ClassUnicode(cls));
550                } else {
551                    let xcls = self.hir_perl_byte_class(x)?;
552                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
553                    cls.union(&xcls);
554                    self.push(HirFrame::ClassBytes(cls));
555                }
556            }
557            ast::ClassSetItem::Bracketed(ref ast) => {
558                if self.flags().unicode() {
559                    let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
560                    self.unicode_fold_and_negate(
561                        &ast.span,
562                        ast.negated,
563                        &mut cls1,
564                    )?;
565
566                    let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
567                    cls2.union(&cls1);
568                    self.push(HirFrame::ClassUnicode(cls2));
569                } else {
570                    let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
571                    self.bytes_fold_and_negate(
572                        &ast.span,
573                        ast.negated,
574                        &mut cls1,
575                    )?;
576
577                    let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
578                    cls2.union(&cls1);
579                    self.push(HirFrame::ClassBytes(cls2));
580                }
581            }
582            // This is handled automatically by the visitor.
583            ast::ClassSetItem::Union(_) => {}
584        }
585        Ok(())
586    }
587
588    fn visit_class_set_binary_op_pre(
589        &mut self,
590        _op: &ast::ClassSetBinaryOp,
591    ) -> Result<()> {
592        if self.flags().unicode() {
593            let cls = hir::ClassUnicode::empty();
594            self.push(HirFrame::ClassUnicode(cls));
595        } else {
596            let cls = hir::ClassBytes::empty();
597            self.push(HirFrame::ClassBytes(cls));
598        }
599        Ok(())
600    }
601
602    fn visit_class_set_binary_op_in(
603        &mut self,
604        _op: &ast::ClassSetBinaryOp,
605    ) -> Result<()> {
606        if self.flags().unicode() {
607            let cls = hir::ClassUnicode::empty();
608            self.push(HirFrame::ClassUnicode(cls));
609        } else {
610            let cls = hir::ClassBytes::empty();
611            self.push(HirFrame::ClassBytes(cls));
612        }
613        Ok(())
614    }
615
616    fn visit_class_set_binary_op_post(
617        &mut self,
618        op: &ast::ClassSetBinaryOp,
619    ) -> Result<()> {
620        use crate::ast::ClassSetBinaryOpKind::*;
621
622        if self.flags().unicode() {
623            let mut rhs = self.pop().unwrap().unwrap_class_unicode();
624            let mut lhs = self.pop().unwrap().unwrap_class_unicode();
625            let mut cls = self.pop().unwrap().unwrap_class_unicode();
626            if self.flags().case_insensitive() {
627                rhs.try_case_fold_simple().map_err(|_| {
628                    self.error(
629                        op.rhs.span().clone(),
630                        ErrorKind::UnicodeCaseUnavailable,
631                    )
632                })?;
633                lhs.try_case_fold_simple().map_err(|_| {
634                    self.error(
635                        op.lhs.span().clone(),
636                        ErrorKind::UnicodeCaseUnavailable,
637                    )
638                })?;
639            }
640            match op.kind {
641                Intersection => lhs.intersect(&rhs),
642                Difference => lhs.difference(&rhs),
643                SymmetricDifference => lhs.symmetric_difference(&rhs),
644            }
645            cls.union(&lhs);
646            self.push(HirFrame::ClassUnicode(cls));
647        } else {
648            let mut rhs = self.pop().unwrap().unwrap_class_bytes();
649            let mut lhs = self.pop().unwrap().unwrap_class_bytes();
650            let mut cls = self.pop().unwrap().unwrap_class_bytes();
651            if self.flags().case_insensitive() {
652                rhs.case_fold_simple();
653                lhs.case_fold_simple();
654            }
655            match op.kind {
656                Intersection => lhs.intersect(&rhs),
657                Difference => lhs.difference(&rhs),
658                SymmetricDifference => lhs.symmetric_difference(&rhs),
659            }
660            cls.union(&lhs);
661            self.push(HirFrame::ClassBytes(cls));
662        }
663        Ok(())
664    }
665}
666
667/// The internal implementation of a translator.
668///
669/// This type is responsible for carrying around the original pattern string,
670/// which is not tied to the internal state of a translator.
671///
672/// A TranslatorI exists for the time it takes to translate a single Ast.
673#[derive(Clone, Debug)]
674struct TranslatorI<'t, 'p> {
675    trans: &'t Translator,
676    pattern: &'p str,
677}
678
679impl<'t, 'p> TranslatorI<'t, 'p> {
680    /// Build a new internal translator.
681    fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
682        TranslatorI { trans, pattern }
683    }
684
685    /// Return a reference to the underlying translator.
686    fn trans(&self) -> &Translator {
687        &self.trans
688    }
689
690    /// Push the given frame on to the call stack.
691    fn push(&self, frame: HirFrame) {
692        self.trans().stack.borrow_mut().push(frame);
693    }
694
695    /// Push the given literal char on to the call stack.
696    ///
697    /// If the top-most element of the stack is a literal, then the char
698    /// is appended to the end of that literal. Otherwise, a new literal
699    /// containing just the given char is pushed to the top of the stack.
700    fn push_char(&self, ch: char) {
701        let mut buf = [0; 4];
702        let bytes = ch.encode_utf8(&mut buf).as_bytes();
703        let mut stack = self.trans().stack.borrow_mut();
704        if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
705            literal.extend_from_slice(bytes);
706        } else {
707            stack.push(HirFrame::Literal(bytes.to_vec()));
708        }
709    }
710
711    /// Push the given literal byte on to the call stack.
712    ///
713    /// If the top-most element of the stack is a literal, then the byte
714    /// is appended to the end of that literal. Otherwise, a new literal
715    /// containing just the given byte is pushed to the top of the stack.
716    fn push_byte(&self, byte: u8) {
717        let mut stack = self.trans().stack.borrow_mut();
718        if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
719            literal.push(byte);
720        } else {
721            stack.push(HirFrame::Literal(vec![byte]));
722        }
723    }
724
725    /// Pop the top of the call stack. If the call stack is empty, return None.
726    fn pop(&self) -> Option<HirFrame> {
727        self.trans().stack.borrow_mut().pop()
728    }
729
730    /// Pop an HIR expression from the top of the stack for a concatenation.
731    ///
732    /// This returns None if the stack is empty or when a concat frame is seen.
733    /// Otherwise, it panics if it could not find an HIR expression.
734    fn pop_concat_expr(&self) -> Option<Hir> {
735        let frame = self.pop()?;
736        match frame {
737            HirFrame::Concat => None,
738            HirFrame::Expr(expr) => Some(expr),
739            HirFrame::Literal(lit) => Some(Hir::literal(lit)),
740            HirFrame::ClassUnicode(_) => {
741                unreachable!("expected expr or concat, got Unicode class")
742            }
743            HirFrame::ClassBytes(_) => {
744                unreachable!("expected expr or concat, got byte class")
745            }
746            HirFrame::Repetition => {
747                unreachable!("expected expr or concat, got repetition")
748            }
749            HirFrame::Group { .. } => {
750                unreachable!("expected expr or concat, got group")
751            }
752            HirFrame::Alternation => {
753                unreachable!("expected expr or concat, got alt marker")
754            }
755            HirFrame::AlternationBranch => {
756                unreachable!("expected expr or concat, got alt branch marker")
757            }
758        }
759    }
760
761    /// Pop an HIR expression from the top of the stack for an alternation.
762    ///
763    /// This returns None if the stack is empty or when an alternation frame is
764    /// seen. Otherwise, it panics if it could not find an HIR expression.
765    fn pop_alt_expr(&self) -> Option<Hir> {
766        let frame = self.pop()?;
767        match frame {
768            HirFrame::Alternation => None,
769            HirFrame::Expr(expr) => Some(expr),
770            HirFrame::Literal(lit) => Some(Hir::literal(lit)),
771            HirFrame::ClassUnicode(_) => {
772                unreachable!("expected expr or alt, got Unicode class")
773            }
774            HirFrame::ClassBytes(_) => {
775                unreachable!("expected expr or alt, got byte class")
776            }
777            HirFrame::Repetition => {
778                unreachable!("expected expr or alt, got repetition")
779            }
780            HirFrame::Group { .. } => {
781                unreachable!("expected expr or alt, got group")
782            }
783            HirFrame::Concat => {
784                unreachable!("expected expr or alt, got concat marker")
785            }
786            HirFrame::AlternationBranch => {
787                unreachable!("expected expr or alt, got alt branch marker")
788            }
789        }
790    }
791
792    /// Create a new error with the given span and error type.
793    fn error(&self, span: Span, kind: ErrorKind) -> Error {
794        Error { kind, pattern: self.pattern.to_string(), span }
795    }
796
797    /// Return a copy of the active flags.
798    fn flags(&self) -> Flags {
799        self.trans().flags.get()
800    }
801
802    /// Set the flags of this translator from the flags set in the given AST.
803    /// Then, return the old flags.
804    fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
805        let old_flags = self.flags();
806        let mut new_flags = Flags::from_ast(ast_flags);
807        new_flags.merge(&old_flags);
808        self.trans().flags.set(new_flags);
809        old_flags
810    }
811
812    /// Convert an Ast literal to its scalar representation.
813    ///
814    /// When Unicode mode is enabled, then this always succeeds and returns a
815    /// `char` (Unicode scalar value).
816    ///
817    /// When Unicode mode is disabled, then a `char` will still be returned
818    /// whenever possible. A byte is returned only when invalid UTF-8 is
819    /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte
820    /// will result in an error when invalid UTF-8 is not allowed.
821    fn ast_literal_to_scalar(
822        &self,
823        lit: &ast::Literal,
824    ) -> Result<Either<char, u8>> {
825        if self.flags().unicode() {
826            return Ok(Either::Left(lit.c));
827        }
828        let byte = match lit.byte() {
829            None => return Ok(Either::Left(lit.c)),
830            Some(byte) => byte,
831        };
832        if byte <= 0x7F {
833            return Ok(Either::Left(char::try_from(byte).unwrap()));
834        }
835        if self.trans().utf8 {
836            return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
837        }
838        Ok(Either::Right(byte))
839    }
840
841    fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> {
842        if !self.flags().case_insensitive() {
843            return Ok(None);
844        }
845        if self.flags().unicode() {
846            // If case folding won't do anything, then don't bother trying.
847            let map = unicode::SimpleCaseFolder::new()
848                .map(|f| f.overlaps(c, c))
849                .map_err(|_| {
850                    self.error(span, ErrorKind::UnicodeCaseUnavailable)
851                })?;
852            if !map {
853                return Ok(None);
854            }
855            let mut cls =
856                hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
857                    c, c,
858                )]);
859            cls.try_case_fold_simple().map_err(|_| {
860                self.error(span, ErrorKind::UnicodeCaseUnavailable)
861            })?;
862            Ok(Some(Hir::class(hir::Class::Unicode(cls))))
863        } else {
864            if !c.is_ascii() {
865                return Ok(None);
866            }
867            // If case folding won't do anything, then don't bother trying.
868            match c {
869                'A'..='Z' | 'a'..='z' => {}
870                _ => return Ok(None),
871            }
872            let mut cls =
873                hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
874                    // OK because 'c.len_utf8() == 1' which in turn implies
875                    // that 'c' is ASCII.
876                    u8::try_from(c).unwrap(),
877                    u8::try_from(c).unwrap(),
878                )]);
879            cls.case_fold_simple();
880            Ok(Some(Hir::class(hir::Class::Bytes(cls))))
881        }
882    }
883
884    fn hir_dot(&self, span: Span) -> Result<Hir> {
885        let (utf8, lineterm, flags) =
886            (self.trans().utf8, self.trans().line_terminator, self.flags());
887        if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
888            return Err(self.error(span, ErrorKind::InvalidUtf8));
889        }
890        let dot = if flags.dot_matches_new_line() {
891            if flags.unicode() {
892                hir::Dot::AnyChar
893            } else {
894                hir::Dot::AnyByte
895            }
896        } else {
897            if flags.unicode() {
898                if flags.crlf() {
899                    hir::Dot::AnyCharExceptCRLF
900                } else {
901                    if !lineterm.is_ascii() {
902                        return Err(
903                            self.error(span, ErrorKind::InvalidLineTerminator)
904                        );
905                    }
906                    hir::Dot::AnyCharExcept(char::from(lineterm))
907                }
908            } else {
909                if flags.crlf() {
910                    hir::Dot::AnyByteExceptCRLF
911                } else {
912                    hir::Dot::AnyByteExcept(lineterm)
913                }
914            }
915        };
916        Ok(Hir::dot(dot))
917    }
918
919    fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
920        let unicode = self.flags().unicode();
921        let multi_line = self.flags().multi_line();
922        let crlf = self.flags().crlf();
923        Ok(match asst.kind {
924            ast::AssertionKind::StartLine => Hir::look(if multi_line {
925                if crlf {
926                    hir::Look::StartCRLF
927                } else {
928                    hir::Look::StartLF
929                }
930            } else {
931                hir::Look::Start
932            }),
933            ast::AssertionKind::EndLine => Hir::look(if multi_line {
934                if crlf {
935                    hir::Look::EndCRLF
936                } else {
937                    hir::Look::EndLF
938                }
939            } else {
940                hir::Look::End
941            }),
942            ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
943            ast::AssertionKind::EndText => Hir::look(hir::Look::End),
944            ast::AssertionKind::WordBoundary => Hir::look(if unicode {
945                hir::Look::WordUnicode
946            } else {
947                hir::Look::WordAscii
948            }),
949            ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
950                hir::Look::WordUnicodeNegate
951            } else {
952                hir::Look::WordAsciiNegate
953            }),
954            ast::AssertionKind::WordBoundaryStart
955            | ast::AssertionKind::WordBoundaryStartAngle => {
956                Hir::look(if unicode {
957                    hir::Look::WordStartUnicode
958                } else {
959                    hir::Look::WordStartAscii
960                })
961            }
962            ast::AssertionKind::WordBoundaryEnd
963            | ast::AssertionKind::WordBoundaryEndAngle => {
964                Hir::look(if unicode {
965                    hir::Look::WordEndUnicode
966                } else {
967                    hir::Look::WordEndAscii
968                })
969            }
970            ast::AssertionKind::WordBoundaryStartHalf => {
971                Hir::look(if unicode {
972                    hir::Look::WordStartHalfUnicode
973                } else {
974                    hir::Look::WordStartHalfAscii
975                })
976            }
977            ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
978                hir::Look::WordEndHalfUnicode
979            } else {
980                hir::Look::WordEndHalfAscii
981            }),
982        })
983    }
984
985    fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
986        let (index, name) = match group.kind {
987            ast::GroupKind::CaptureIndex(index) => (index, None),
988            ast::GroupKind::CaptureName { ref name, .. } => {
989                (name.index, Some(name.name.clone().into_boxed_str()))
990            }
991            // The HIR doesn't need to use non-capturing groups, since the way
992            // in which the data type is defined handles this automatically.
993            ast::GroupKind::NonCapturing(_) => return expr,
994        };
995        Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
996    }
997
998    fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
999        let (min, max) = match rep.op.kind {
1000            ast::RepetitionKind::ZeroOrOne => (0, Some(1)),
1001            ast::RepetitionKind::ZeroOrMore => (0, None),
1002            ast::RepetitionKind::OneOrMore => (1, None),
1003            ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
1004                (m, Some(m))
1005            }
1006            ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
1007                (m, None)
1008            }
1009            ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
1010                m,
1011                n,
1012            )) => (m, Some(n)),
1013        };
1014        let greedy =
1015            if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
1016        Hir::repetition(hir::Repetition {
1017            min,
1018            max,
1019            greedy,
1020            sub: Box::new(expr),
1021        })
1022    }
1023
1024    fn hir_unicode_class(
1025        &self,
1026        ast_class: &ast::ClassUnicode,
1027    ) -> Result<hir::ClassUnicode> {
1028        use crate::ast::ClassUnicodeKind::*;
1029
1030        if !self.flags().unicode() {
1031            return Err(
1032                self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
1033            );
1034        }
1035        let query = match ast_class.kind {
1036            OneLetter(name) => ClassQuery::OneLetter(name),
1037            Named(ref name) => ClassQuery::Binary(name),
1038            NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
1039                property_name: name,
1040                property_value: value,
1041            },
1042        };
1043        let mut result = self.convert_unicode_class_error(
1044            &ast_class.span,
1045            unicode::class(query),
1046        );
1047        if let Ok(ref mut class) = result {
1048            self.unicode_fold_and_negate(
1049                &ast_class.span,
1050                ast_class.is_negated(),
1051                class,
1052            )?;
1053        }
1054        result
1055    }
1056
1057    fn hir_ascii_unicode_class(
1058        &self,
1059        ast: &ast::ClassAscii,
1060    ) -> Result<hir::ClassUnicode> {
1061        let mut cls = hir::ClassUnicode::new(
1062            ascii_class_as_chars(&ast.kind)
1063                .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1064        );
1065        self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1066        Ok(cls)
1067    }
1068
1069    fn hir_ascii_byte_class(
1070        &self,
1071        ast: &ast::ClassAscii,
1072    ) -> Result<hir::ClassBytes> {
1073        let mut cls = hir::ClassBytes::new(
1074            ascii_class(&ast.kind)
1075                .map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1076        );
1077        self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1078        Ok(cls)
1079    }
1080
1081    fn hir_perl_unicode_class(
1082        &self,
1083        ast_class: &ast::ClassPerl,
1084    ) -> Result<hir::ClassUnicode> {
1085        use crate::ast::ClassPerlKind::*;
1086
1087        assert!(self.flags().unicode());
1088        let result = match ast_class.kind {
1089            Digit => unicode::perl_digit(),
1090            Space => unicode::perl_space(),
1091            Word => unicode::perl_word(),
1092        };
1093        let mut class =
1094            self.convert_unicode_class_error(&ast_class.span, result)?;
1095        // We needn't apply case folding here because the Perl Unicode classes
1096        // are already closed under Unicode simple case folding.
1097        if ast_class.negated {
1098            class.negate();
1099        }
1100        Ok(class)
1101    }
1102
1103    fn hir_perl_byte_class(
1104        &self,
1105        ast_class: &ast::ClassPerl,
1106    ) -> Result<hir::ClassBytes> {
1107        use crate::ast::ClassPerlKind::*;
1108
1109        assert!(!self.flags().unicode());
1110        let mut class = match ast_class.kind {
1111            Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
1112            Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
1113            Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
1114        };
1115        // We needn't apply case folding here because the Perl ASCII classes
1116        // are already closed (under ASCII case folding).
1117        if ast_class.negated {
1118            class.negate();
1119        }
1120        // Negating a Perl byte class is likely to cause it to match invalid
1121        // UTF-8. That's only OK if the translator is configured to allow such
1122        // things.
1123        if self.trans().utf8 && !class.is_ascii() {
1124            return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
1125        }
1126        Ok(class)
1127    }
1128
1129    /// Converts the given Unicode specific error to an HIR translation error.
1130    ///
1131    /// The span given should approximate the position at which an error would
1132    /// occur.
1133    fn convert_unicode_class_error(
1134        &self,
1135        span: &Span,
1136        result: core::result::Result<hir::ClassUnicode, unicode::Error>,
1137    ) -> Result<hir::ClassUnicode> {
1138        result.map_err(|err| {
1139            let sp = span.clone();
1140            match err {
1141                unicode::Error::PropertyNotFound => {
1142                    self.error(sp, ErrorKind::UnicodePropertyNotFound)
1143                }
1144                unicode::Error::PropertyValueNotFound => {
1145                    self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
1146                }
1147                unicode::Error::PerlClassNotFound => {
1148                    self.error(sp, ErrorKind::UnicodePerlClassNotFound)
1149                }
1150            }
1151        })
1152    }
1153
1154    fn unicode_fold_and_negate(
1155        &self,
1156        span: &Span,
1157        negated: bool,
1158        class: &mut hir::ClassUnicode,
1159    ) -> Result<()> {
1160        // Note that we must apply case folding before negation!
1161        // Consider `(?i)[^x]`. If we applied negation first, then
1162        // the result would be the character class that matched any
1163        // Unicode scalar value.
1164        if self.flags().case_insensitive() {
1165            class.try_case_fold_simple().map_err(|_| {
1166                self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
1167            })?;
1168        }
1169        if negated {
1170            class.negate();
1171        }
1172        Ok(())
1173    }
1174
1175    fn bytes_fold_and_negate(
1176        &self,
1177        span: &Span,
1178        negated: bool,
1179        class: &mut hir::ClassBytes,
1180    ) -> Result<()> {
1181        // Note that we must apply case folding before negation!
1182        // Consider `(?i)[^x]`. If we applied negation first, then
1183        // the result would be the character class that matched any
1184        // Unicode scalar value.
1185        if self.flags().case_insensitive() {
1186            class.case_fold_simple();
1187        }
1188        if negated {
1189            class.negate();
1190        }
1191        if self.trans().utf8 && !class.is_ascii() {
1192            return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
1193        }
1194        Ok(())
1195    }
1196
1197    /// Return a scalar byte value suitable for use as a literal in a byte
1198    /// character class.
1199    fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
1200        match self.ast_literal_to_scalar(ast)? {
1201            Either::Right(byte) => Ok(byte),
1202            Either::Left(ch) => {
1203                if ch.is_ascii() {
1204                    Ok(u8::try_from(ch).unwrap())
1205                } else {
1206                    // We can't feasibly support Unicode in
1207                    // byte oriented classes. Byte classes don't
1208                    // do Unicode case folding.
1209                    Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
1210                }
1211            }
1212        }
1213    }
1214}
1215
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent (`None`), present but
/// disabled (`Some(false)`) or present but enabled (`Some(true)`).
///
/// The accessor methods on this type resolve an absent flag to its default:
/// `unicode` defaults to enabled, every other flag defaults to disabled.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    // Set from `ast::Flag::CaseInsensitive`.
    case_insensitive: Option<bool>,
    // Set from `ast::Flag::MultiLine`.
    multi_line: Option<bool>,
    // Set from `ast::Flag::DotMatchesNewLine`.
    dot_matches_new_line: Option<bool>,
    // Set from `ast::Flag::SwapGreed`.
    swap_greed: Option<bool>,
    // Set from `ast::Flag::Unicode`.
    unicode: Option<bool>,
    // Set from `ast::Flag::CRLF`.
    crlf: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
1232
1233impl Flags {
1234    fn from_ast(ast: &ast::Flags) -> Flags {
1235        let mut flags = Flags::default();
1236        let mut enable = true;
1237        for item in &ast.items {
1238            match item.kind {
1239                ast::FlagsItemKind::Negation => {
1240                    enable = false;
1241                }
1242                ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
1243                    flags.case_insensitive = Some(enable);
1244                }
1245                ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
1246                    flags.multi_line = Some(enable);
1247                }
1248                ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
1249                    flags.dot_matches_new_line = Some(enable);
1250                }
1251                ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
1252                    flags.swap_greed = Some(enable);
1253                }
1254                ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
1255                    flags.unicode = Some(enable);
1256                }
1257                ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
1258                    flags.crlf = Some(enable);
1259                }
1260                ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
1261            }
1262        }
1263        flags
1264    }
1265
1266    fn merge(&mut self, previous: &Flags) {
1267        if self.case_insensitive.is_none() {
1268            self.case_insensitive = previous.case_insensitive;
1269        }
1270        if self.multi_line.is_none() {
1271            self.multi_line = previous.multi_line;
1272        }
1273        if self.dot_matches_new_line.is_none() {
1274            self.dot_matches_new_line = previous.dot_matches_new_line;
1275        }
1276        if self.swap_greed.is_none() {
1277            self.swap_greed = previous.swap_greed;
1278        }
1279        if self.unicode.is_none() {
1280            self.unicode = previous.unicode;
1281        }
1282        if self.crlf.is_none() {
1283            self.crlf = previous.crlf;
1284        }
1285    }
1286
1287    fn case_insensitive(&self) -> bool {
1288        self.case_insensitive.unwrap_or(false)
1289    }
1290
1291    fn multi_line(&self) -> bool {
1292        self.multi_line.unwrap_or(false)
1293    }
1294
1295    fn dot_matches_new_line(&self) -> bool {
1296        self.dot_matches_new_line.unwrap_or(false)
1297    }
1298
1299    fn swap_greed(&self) -> bool {
1300        self.swap_greed.unwrap_or(false)
1301    }
1302
1303    fn unicode(&self) -> bool {
1304        self.unicode.unwrap_or(true)
1305    }
1306
1307    fn crlf(&self) -> bool {
1308        self.crlf.unwrap_or(false)
1309    }
1310}
1311
1312fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
1313    let ranges: Vec<_> = ascii_class(kind)
1314        .map(|(s, e)| hir::ClassBytesRange::new(s, e))
1315        .collect();
1316    hir::ClassBytes::new(ranges)
1317}
1318
1319fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> {
1320    use crate::ast::ClassAsciiKind::*;
1321
1322    let slice: &'static [(u8, u8)] = match *kind {
1323        Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1324        Alpha => &[(b'A', b'Z'), (b'a', b'z')],
1325        Ascii => &[(b'\x00', b'\x7F')],
1326        Blank => &[(b'\t', b'\t'), (b' ', b' ')],
1327        Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1328        Digit => &[(b'0', b'9')],
1329        Graph => &[(b'!', b'~')],
1330        Lower => &[(b'a', b'z')],
1331        Print => &[(b' ', b'~')],
1332        Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1333        Space => &[
1334            (b'\t', b'\t'),
1335            (b'\n', b'\n'),
1336            (b'\x0B', b'\x0B'),
1337            (b'\x0C', b'\x0C'),
1338            (b'\r', b'\r'),
1339            (b' ', b' '),
1340        ],
1341        Upper => &[(b'A', b'Z')],
1342        Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1343        Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1344    };
1345    slice.iter().copied()
1346}
1347
1348fn ascii_class_as_chars(
1349    kind: &ast::ClassAsciiKind,
1350) -> impl Iterator<Item = (char, char)> {
1351    ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e)))
1352}
1353
1354#[cfg(test)]
1355mod tests {
1356    use crate::{
1357        ast::{parse::ParserBuilder, Position},
1358        hir::{Look, Properties},
1359    };
1360
1361    use super::*;
1362
1363    // We create these errors to compare with real hir::Errors in the tests.
1364    // We define equality between TestError and hir::Error to disregard the
1365    // pattern string in hir::Error, which is annoying to provide in tests.
1366    #[derive(Clone, Debug)]
1367    struct TestError {
1368        span: Span,
1369        kind: hir::ErrorKind,
1370    }
1371
1372    impl PartialEq<hir::Error> for TestError {
1373        fn eq(&self, other: &hir::Error) -> bool {
1374            self.span == other.span && self.kind == other.kind
1375        }
1376    }
1377
1378    impl PartialEq<TestError> for hir::Error {
1379        fn eq(&self, other: &TestError) -> bool {
1380            self.span == other.span && self.kind == other.kind
1381        }
1382    }
1383
    /// Parse `pattern` into an `Ast` using a parser with octal syntax
    /// enabled. Panics if the pattern fails to parse.
    fn parse(pattern: &str) -> Ast {
        ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
    }
1387
1388    fn t(pattern: &str) -> Hir {
1389        TranslatorBuilder::new()
1390            .utf8(true)
1391            .build()
1392            .translate(pattern, &parse(pattern))
1393            .unwrap()
1394    }
1395
1396    fn t_err(pattern: &str) -> hir::Error {
1397        TranslatorBuilder::new()
1398            .utf8(true)
1399            .build()
1400            .translate(pattern, &parse(pattern))
1401            .unwrap_err()
1402    }
1403
1404    fn t_bytes(pattern: &str) -> Hir {
1405        TranslatorBuilder::new()
1406            .utf8(false)
1407            .build()
1408            .translate(pattern, &parse(pattern))
1409            .unwrap()
1410    }
1411
1412    fn props(pattern: &str) -> Properties {
1413        t(pattern).properties().clone()
1414    }
1415
1416    fn props_bytes(pattern: &str) -> Properties {
1417        t_bytes(pattern).properties().clone()
1418    }
1419
    /// Build a literal `Hir` from the UTF-8 bytes of `s`.
    fn hir_lit(s: &str) -> Hir {
        hir_blit(s.as_bytes())
    }
1423
    /// Build a literal `Hir` from the given bytes.
    fn hir_blit(s: &[u8]) -> Hir {
        Hir::literal(s)
    }
1427
1428    fn hir_capture(index: u32, expr: Hir) -> Hir {
1429        Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
1430    }
1431
1432    fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
1433        Hir::capture(hir::Capture {
1434            index,
1435            name: Some(name.into()),
1436            sub: Box::new(expr),
1437        })
1438    }
1439
1440    fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1441        Hir::repetition(hir::Repetition {
1442            min: 0,
1443            max: Some(1),
1444            greedy,
1445            sub: Box::new(expr),
1446        })
1447    }
1448
1449    fn hir_star(greedy: bool, expr: Hir) -> Hir {
1450        Hir::repetition(hir::Repetition {
1451            min: 0,
1452            max: None,
1453            greedy,
1454            sub: Box::new(expr),
1455        })
1456    }
1457
1458    fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1459        Hir::repetition(hir::Repetition {
1460            min: 1,
1461            max: None,
1462            greedy,
1463            sub: Box::new(expr),
1464        })
1465    }
1466
1467    fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir {
1468        Hir::repetition(hir::Repetition {
1469            min,
1470            max,
1471            greedy,
1472            sub: Box::new(expr),
1473        })
1474    }
1475
    /// Build an alternation `Hir` from the given branches.
    fn hir_alt(alts: Vec<Hir>) -> Hir {
        Hir::alternation(alts)
    }
1479
    /// Build a concatenation `Hir` from the given sub-expressions.
    fn hir_cat(exprs: Vec<Hir>) -> Hir {
        Hir::concat(exprs)
    }
1483
1484    #[allow(dead_code)]
1485    fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
1486        Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1487    }
1488
1489    #[allow(dead_code)]
1490    fn hir_uclass_perl_word() -> Hir {
1491        Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
1492    }
1493
1494    fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir {
1495        Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(
1496            ascii_class_as_chars(kind)
1497                .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1498        )))
1499    }
1500
1501    fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir {
1502        Hir::class(hir::Class::Bytes(hir::ClassBytes::new(
1503            ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1504        )))
1505    }
1506
    /// Build a Unicode class `Hir` from inclusive `(start, end)` ranges.
    fn hir_uclass(ranges: &[(char, char)]) -> Hir {
        Hir::class(uclass(ranges))
    }
1510
    /// Build a byte class `Hir` from inclusive `(start, end)` ranges.
    fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
        Hir::class(bclass(ranges))
    }
1514
1515    fn hir_case_fold(expr: Hir) -> Hir {
1516        match expr.into_kind() {
1517            HirKind::Class(mut cls) => {
1518                cls.case_fold_simple();
1519                Hir::class(cls)
1520            }
1521            _ => panic!("cannot case fold non-class Hir expr"),
1522        }
1523    }
1524
1525    fn hir_negate(expr: Hir) -> Hir {
1526        match expr.into_kind() {
1527            HirKind::Class(mut cls) => {
1528                cls.negate();
1529                Hir::class(cls)
1530            }
1531            _ => panic!("cannot negate non-class Hir expr"),
1532        }
1533    }
1534
1535    fn uclass(ranges: &[(char, char)]) -> hir::Class {
1536        let ranges: Vec<hir::ClassUnicodeRange> = ranges
1537            .iter()
1538            .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1539            .collect();
1540        hir::Class::Unicode(hir::ClassUnicode::new(ranges))
1541    }
1542
1543    fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
1544        let ranges: Vec<hir::ClassBytesRange> = ranges
1545            .iter()
1546            .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1547            .collect();
1548        hir::Class::Bytes(hir::ClassBytes::new(ranges))
1549    }
1550
    /// Apply simple case folding to `cls` and wrap the result in an `Hir`.
    /// Only compiled when the `unicode-case` feature is enabled.
    #[cfg(feature = "unicode-case")]
    fn class_case_fold(mut cls: hir::Class) -> Hir {
        cls.case_fold_simple();
        Hir::class(cls)
    }
1556
    /// Negate `cls` and wrap the result in an `Hir`.
    fn class_negate(mut cls: hir::Class) -> Hir {
        cls.negate();
        Hir::class(cls)
    }
1561
1562    #[allow(dead_code)]
1563    fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1564        use crate::hir::Class::{Bytes, Unicode};
1565
1566        match (expr1.into_kind(), expr2.into_kind()) {
1567            (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1568                c1.union(&c2);
1569                Hir::class(hir::Class::Unicode(c1))
1570            }
1571            (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1572                c1.union(&c2);
1573                Hir::class(hir::Class::Bytes(c1))
1574            }
1575            _ => panic!("cannot union non-class Hir exprs"),
1576        }
1577    }
1578
1579    #[allow(dead_code)]
1580    fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1581        use crate::hir::Class::{Bytes, Unicode};
1582
1583        match (expr1.into_kind(), expr2.into_kind()) {
1584            (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1585                c1.difference(&c2);
1586                Hir::class(hir::Class::Unicode(c1))
1587            }
1588            (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1589                c1.difference(&c2);
1590                Hir::class(hir::Class::Bytes(c1))
1591            }
1592            _ => panic!("cannot difference non-class Hir exprs"),
1593        }
1594    }
1595
    /// Build a look-around `Hir` from the given assertion.
    fn hir_look(look: hir::Look) -> Hir {
        Hir::look(look)
    }
1599
    // Checks the translation of patterns that contain empty sub-expressions:
    // the empty pattern, a lone flag directive, empty groups and empty
    // alternation branches.
    #[test]
    fn empty() {
        assert_eq!(t(""), Hir::empty());
        assert_eq!(t("(?i)"), Hir::empty());
        assert_eq!(t("()"), hir_capture(1, Hir::empty()));
        assert_eq!(t("(?:)"), Hir::empty());
        assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", Hir::empty()));
        assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
        assert_eq!(
            t("()|()"),
            hir_alt(vec![
                hir_capture(1, Hir::empty()),
                hir_capture(2, Hir::empty()),
            ])
        );
        // Empty branches at the start, end and middle of an alternation.
        assert_eq!(
            t("(|b)"),
            hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
        );
        assert_eq!(
            t("(a|)"),
            hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
        );
        assert_eq!(
            t("(a||c)"),
            hir_capture(
                1,
                hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
            )
        );
        assert_eq!(
            t("(||)"),
            hir_capture(
                1,
                hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
            )
        );
    }
1638
    // Checks the translation of plain literals, both with the translator's
    // `utf8` option enabled (`t`, `t_err`) and disabled (`t_bytes`).
    #[test]
    fn literal() {
        assert_eq!(t("a"), hir_lit("a"));
        assert_eq!(t("(?-u)a"), hir_lit("a"));
        assert_eq!(t("☃"), hir_lit("☃"));
        assert_eq!(t("abcd"), hir_lit("abcd"));

        // Note that "\x61" in a non-raw string is already `a` by the time the
        // parser sees it, while the raw string hands the escape to the
        // parser.
        assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
        assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
        assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
        assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

        assert_eq!(t("(?-u)☃"), hir_lit("☃"));
        // A non-ASCII byte escape is rejected when `utf8` is enabled. The
        // span covers the `\xFF` escape.
        assert_eq!(
            t_err(r"(?-u)\xFF"),
            TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(5, 1, 6),
                    Position::new(9, 1, 10)
                ),
            }
        );
    }
1663
1664    #[test]
1665    fn literal_case_insensitive() {
1666        #[cfg(feature = "unicode-case")]
1667        assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
1668        #[cfg(feature = "unicode-case")]
1669        assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
1670        #[cfg(feature = "unicode-case")]
1671        assert_eq!(
1672            t("a(?i)a(?-i)a"),
1673            hir_cat(vec![
1674                hir_lit("a"),
1675                hir_uclass(&[('A', 'A'), ('a', 'a')]),
1676                hir_lit("a"),
1677            ])
1678        );
1679        #[cfg(feature = "unicode-case")]
1680        assert_eq!(
1681            t("(?i)ab@c"),
1682            hir_cat(vec![
1683                hir_uclass(&[('A', 'A'), ('a', 'a')]),
1684                hir_uclass(&[('B', 'B'), ('b', 'b')]),
1685                hir_lit("@"),
1686                hir_uclass(&[('C', 'C'), ('c', 'c')]),
1687            ])
1688        );
1689        #[cfg(feature = "unicode-case")]
1690        assert_eq!(
1691            t("(?i)β"),
1692            hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
1693        );
1694
1695        assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
1696        #[cfg(feature = "unicode-case")]
1697        assert_eq!(
1698            t("(?-u)a(?i)a(?-i)a"),
1699            hir_cat(vec![
1700                hir_lit("a"),
1701                hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
1702                hir_lit("a"),
1703            ])
1704        );
1705        assert_eq!(
1706            t("(?i-u)ab@c"),
1707            hir_cat(vec![
1708                hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
1709                hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
1710                hir_lit("@"),
1711                hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
1712            ])
1713        );
1714
1715        assert_eq!(
1716            t_bytes("(?i-u)a"),
1717            hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
1718        );
1719        assert_eq!(
1720            t_bytes("(?i-u)\x61"),
1721            hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
1722        );
1723        assert_eq!(
1724            t_bytes(r"(?i-u)\x61"),
1725            hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
1726        );
1727        assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
1728
1729        assert_eq!(t("(?i-u)β"), hir_lit("β"),);
1730    }
1731
1732    #[test]
1733    fn dot() {
1734        assert_eq!(
1735            t("."),
1736            hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
1737        );
1738        assert_eq!(
1739            t("(?R)."),
1740            hir_uclass(&[
1741                ('\0', '\t'),
1742                ('\x0B', '\x0C'),
1743                ('\x0E', '\u{10FFFF}'),
1744            ])
1745        );
1746        assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
1747        assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
1748        assert_eq!(
1749            t_bytes("(?-u)."),
1750            hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
1751        );
1752        assert_eq!(
1753            t_bytes("(?R-u)."),
1754            hir_bclass(&[
1755                (b'\0', b'\t'),
1756                (b'\x0B', b'\x0C'),
1757                (b'\x0E', b'\xFF'),
1758            ])
1759        );
1760        assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
1761        assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
1762
1763        // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
1764        assert_eq!(
1765            t_err("(?-u)."),
1766            TestError {
1767                kind: hir::ErrorKind::InvalidUtf8,
1768                span: Span::new(
1769                    Position::new(5, 1, 6),
1770                    Position::new(6, 1, 7)
1771                ),
1772            }
1773        );
1774        assert_eq!(
1775            t_err("(?R-u)."),
1776            TestError {
1777                kind: hir::ErrorKind::InvalidUtf8,
1778                span: Span::new(
1779                    Position::new(6, 1, 7),
1780                    Position::new(7, 1, 8)
1781                ),
1782            }
1783        );
1784        assert_eq!(
1785            t_err("(?s-u)."),
1786            TestError {
1787                kind: hir::ErrorKind::InvalidUtf8,
1788                span: Span::new(
1789                    Position::new(6, 1, 7),
1790                    Position::new(7, 1, 8)
1791                ),
1792            }
1793        );
1794        assert_eq!(
1795            t_err("(?Rs-u)."),
1796            TestError {
1797                kind: hir::ErrorKind::InvalidUtf8,
1798                span: Span::new(
1799                    Position::new(7, 1, 8),
1800                    Position::new(8, 1, 9)
1801                ),
1802            }
1803        );
1804    }
1805
1806    #[test]
1807    fn assertions() {
1808        assert_eq!(t("^"), hir_look(hir::Look::Start));
1809        assert_eq!(t("$"), hir_look(hir::Look::End));
1810        assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
1811        assert_eq!(t(r"\z"), hir_look(hir::Look::End));
1812        assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
1813        assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
1814        assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
1815        assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
1816
1817        assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode));
1818        assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate));
1819        assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii));
1820        assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
1821    }
1822
1823    #[test]
1824    fn group() {
1825        assert_eq!(t("(a)"), hir_capture(1, hir_lit("a")));
1826        assert_eq!(
1827            t("(a)(b)"),
1828            hir_cat(vec![
1829                hir_capture(1, hir_lit("a")),
1830                hir_capture(2, hir_lit("b")),
1831            ])
1832        );
1833        assert_eq!(
1834            t("(a)|(b)"),
1835            hir_alt(vec![
1836                hir_capture(1, hir_lit("a")),
1837                hir_capture(2, hir_lit("b")),
1838            ])
1839        );
1840        assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty()));
1841        assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", hir_lit("a")));
1842        assert_eq!(
1843            t("(?P<foo>a)(?P<bar>b)"),
1844            hir_cat(vec![
1845                hir_capture_name(1, "foo", hir_lit("a")),
1846                hir_capture_name(2, "bar", hir_lit("b")),
1847            ])
1848        );
1849        assert_eq!(t("(?:)"), Hir::empty());
1850        assert_eq!(t("(?:a)"), hir_lit("a"));
1851        assert_eq!(
1852            t("(?:a)(b)"),
1853            hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),])
1854        );
1855        assert_eq!(
1856            t("(a)(?:b)(c)"),
1857            hir_cat(vec![
1858                hir_capture(1, hir_lit("a")),
1859                hir_lit("b"),
1860                hir_capture(2, hir_lit("c")),
1861            ])
1862        );
1863        assert_eq!(
1864            t("(a)(?P<foo>b)(c)"),
1865            hir_cat(vec![
1866                hir_capture(1, hir_lit("a")),
1867                hir_capture_name(2, "foo", hir_lit("b")),
1868                hir_capture(3, hir_lit("c")),
1869            ])
1870        );
1871        assert_eq!(t("()"), hir_capture(1, Hir::empty()));
1872        assert_eq!(t("((?i))"), hir_capture(1, Hir::empty()));
1873        assert_eq!(t("((?x))"), hir_capture(1, Hir::empty()));
1874        assert_eq!(
1875            t("(((?x)))"),
1876            hir_capture(1, hir_capture(2, Hir::empty()))
1877        );
1878    }
1879
1880    #[test]
1881    fn line_anchors() {
1882        assert_eq!(t("^"), hir_look(hir::Look::Start));
1883        assert_eq!(t("$"), hir_look(hir::Look::End));
1884        assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
1885        assert_eq!(t(r"\z"), hir_look(hir::Look::End));
1886
1887        assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
1888        assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
1889        assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
1890        assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
1891
1892        assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start));
1893        assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End));
1894        assert_eq!(t("(?R)^"), hir_look(hir::Look::Start));
1895        assert_eq!(t("(?R)$"), hir_look(hir::Look::End));
1896
1897        assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start));
1898        assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End));
1899        assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF));
1900        assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF));
1901    }
1902
1903    #[test]
1904    fn flags() {
1905        #[cfg(feature = "unicode-case")]
1906        assert_eq!(
1907            t("(?i:a)a"),
1908            hir_cat(
1909                vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),]
1910            )
1911        );
1912        assert_eq!(
1913            t("(?i-u:a)β"),
1914            hir_cat(vec![
1915                hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
1916                hir_lit("β"),
1917            ])
1918        );
1919        assert_eq!(
1920            t("(?:(?i-u)a)b"),
1921            hir_cat(vec![
1922                hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
1923                hir_lit("b"),
1924            ])
1925        );
1926        assert_eq!(
1927            t("((?i-u)a)b"),
1928            hir_cat(vec![
1929                hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
1930                hir_lit("b"),
1931            ])
1932        );
1933        #[cfg(feature = "unicode-case")]
1934        assert_eq!(
1935            t("(?i)(?-i:a)a"),
1936            hir_cat(
1937                vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),]
1938            )
1939        );
1940        #[cfg(feature = "unicode-case")]
1941        assert_eq!(
1942            t("(?im)a^"),
1943            hir_cat(vec![
1944                hir_uclass(&[('A', 'A'), ('a', 'a')]),
1945                hir_look(hir::Look::StartLF),
1946            ])
1947        );
1948        #[cfg(feature = "unicode-case")]
1949        assert_eq!(
1950            t("(?im)a^(?i-m)a^"),
1951            hir_cat(vec![
1952                hir_uclass(&[('A', 'A'), ('a', 'a')]),
1953                hir_look(hir::Look::StartLF),
1954                hir_uclass(&[('A', 'A'), ('a', 'a')]),
1955                hir_look(hir::Look::Start),
1956            ])
1957        );
1958        assert_eq!(
1959            t("(?U)a*a*?(?-U)a*a*?"),
1960            hir_cat(vec![
1961                hir_star(false, hir_lit("a")),
1962                hir_star(true, hir_lit("a")),
1963                hir_star(true, hir_lit("a")),
1964                hir_star(false, hir_lit("a")),
1965            ])
1966        );
1967        #[cfg(feature = "unicode-case")]
1968        assert_eq!(
1969            t("(?:a(?i)a)a"),
1970            hir_cat(vec![
1971                hir_cat(vec![
1972                    hir_lit("a"),
1973                    hir_uclass(&[('A', 'A'), ('a', 'a')]),
1974                ]),
1975                hir_lit("a"),
1976            ])
1977        );
1978        #[cfg(feature = "unicode-case")]
1979        assert_eq!(
1980            t("(?i)(?:a(?-i)a)a"),
1981            hir_cat(vec![
1982                hir_cat(vec![
1983                    hir_uclass(&[('A', 'A'), ('a', 'a')]),
1984                    hir_lit("a"),
1985                ]),
1986                hir_uclass(&[('A', 'A'), ('a', 'a')]),
1987            ])
1988        );
1989    }
1990
1991    #[test]
1992    fn escape() {
1993        assert_eq!(
1994            t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
1995            hir_lit(r"\.+*?()|[]{}^$#")
1996        );
1997    }
1998
1999    #[test]
2000    fn repetition() {
2001        assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
2002        assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
2003        assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
2004        assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
2005        assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
2006        assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));
2007
2008        assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),));
2009        assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),));
2010        assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),));
2011        assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),));
2012        assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),));
2013        assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),));
2014
2015        assert_eq!(
2016            t("ab?"),
2017            hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
2018        );
2019        assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
2020        assert_eq!(
2021            t("a|b?"),
2022            hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
2023        );
2024    }
2025
2026    #[test]
2027    fn cat_alt() {
2028        let a = || hir_look(hir::Look::Start);
2029        let b = || hir_look(hir::Look::End);
2030        let c = || hir_look(hir::Look::WordUnicode);
2031        let d = || hir_look(hir::Look::WordUnicodeNegate);
2032
2033        assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()])));
2034        assert_eq!(t("^|$"), hir_alt(vec![a(), b()]));
2035        assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()]));
2036        assert_eq!(
2037            t(r"^$|$\b|\b\B"),
2038            hir_alt(vec![
2039                hir_cat(vec![a(), b()]),
2040                hir_cat(vec![b(), c()]),
2041                hir_cat(vec![c(), d()]),
2042            ])
2043        );
2044        assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()])));
2045        assert_eq!(
2046            t(r"(^|$|\b)"),
2047            hir_capture(1, hir_alt(vec![a(), b(), c()]))
2048        );
2049        assert_eq!(
2050            t(r"(^$|$\b|\b\B)"),
2051            hir_capture(
2052                1,
2053                hir_alt(vec![
2054                    hir_cat(vec![a(), b()]),
2055                    hir_cat(vec![b(), c()]),
2056                    hir_cat(vec![c(), d()]),
2057                ])
2058            )
2059        );
2060        assert_eq!(
2061            t(r"(^$|($\b|(\b\B)))"),
2062            hir_capture(
2063                1,
2064                hir_alt(vec![
2065                    hir_cat(vec![a(), b()]),
2066                    hir_capture(
2067                        2,
2068                        hir_alt(vec![
2069                            hir_cat(vec![b(), c()]),
2070                            hir_capture(3, hir_cat(vec![c(), d()])),
2071                        ])
2072                    ),
2073                ])
2074            )
2075        );
2076    }
2077
2078    // Tests the HIR transformation of things like '[a-z]|[A-Z]' into
2079    // '[A-Za-z]'. In other words, an alternation of just classes is always
2080    // equivalent to a single class corresponding to the union of the branches
2081    // in that class. (Unless some branches match invalid UTF-8 and others
2082    // match non-ASCII Unicode.)
2083    #[test]
2084    fn cat_class_flattened() {
2085        assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
2086        // Combining all of the letter properties should give us the one giant
2087        // letter property.
2088        #[cfg(feature = "unicode-gencat")]
2089        assert_eq!(
2090            t(r"(?x)
2091                \p{Lowercase_Letter}
2092                |\p{Uppercase_Letter}
2093                |\p{Titlecase_Letter}
2094                |\p{Modifier_Letter}
2095                |\p{Other_Letter}
2096            "),
2097            hir_uclass_query(ClassQuery::Binary("letter"))
2098        );
2099        // Byte classes that can truly match invalid UTF-8 cannot be combined
2100        // with Unicode classes.
2101        assert_eq!(
2102            t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"),
2103            hir_alt(vec![
2104                hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
2105                hir_bclass(&[(b'\x90', b'\xFF')]),
2106                hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
2107            ])
2108        );
2109        // Byte classes on their own can be combined, even if some are ASCII
2110        // and others are invalid UTF-8.
2111        assert_eq!(
2112            t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"),
2113            hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
2114        );
2115    }
2116
2117    #[test]
2118    fn class_ascii() {
2119        assert_eq!(
2120            t("[[:alnum:]]"),
2121            hir_ascii_uclass(&ast::ClassAsciiKind::Alnum)
2122        );
2123        assert_eq!(
2124            t("[[:alpha:]]"),
2125            hir_ascii_uclass(&ast::ClassAsciiKind::Alpha)
2126        );
2127        assert_eq!(
2128            t("[[:ascii:]]"),
2129            hir_ascii_uclass(&ast::ClassAsciiKind::Ascii)
2130        );
2131        assert_eq!(
2132            t("[[:blank:]]"),
2133            hir_ascii_uclass(&ast::ClassAsciiKind::Blank)
2134        );
2135        assert_eq!(
2136            t("[[:cntrl:]]"),
2137            hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl)
2138        );
2139        assert_eq!(
2140            t("[[:digit:]]"),
2141            hir_ascii_uclass(&ast::ClassAsciiKind::Digit)
2142        );
2143        assert_eq!(
2144            t("[[:graph:]]"),
2145            hir_ascii_uclass(&ast::ClassAsciiKind::Graph)
2146        );
2147        assert_eq!(
2148            t("[[:lower:]]"),
2149            hir_ascii_uclass(&ast::ClassAsciiKind::Lower)
2150        );
2151        assert_eq!(
2152            t("[[:print:]]"),
2153            hir_ascii_uclass(&ast::ClassAsciiKind::Print)
2154        );
2155        assert_eq!(
2156            t("[[:punct:]]"),
2157            hir_ascii_uclass(&ast::ClassAsciiKind::Punct)
2158        );
2159        assert_eq!(
2160            t("[[:space:]]"),
2161            hir_ascii_uclass(&ast::ClassAsciiKind::Space)
2162        );
2163        assert_eq!(
2164            t("[[:upper:]]"),
2165            hir_ascii_uclass(&ast::ClassAsciiKind::Upper)
2166        );
2167        assert_eq!(
2168            t("[[:word:]]"),
2169            hir_ascii_uclass(&ast::ClassAsciiKind::Word)
2170        );
2171        assert_eq!(
2172            t("[[:xdigit:]]"),
2173            hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit)
2174        );
2175
2176        assert_eq!(
2177            t("[[:^lower:]]"),
2178            hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower))
2179        );
2180        #[cfg(feature = "unicode-case")]
2181        assert_eq!(
2182            t("(?i)[[:lower:]]"),
2183            hir_uclass(&[
2184                ('A', 'Z'),
2185                ('a', 'z'),
2186                ('\u{17F}', '\u{17F}'),
2187                ('\u{212A}', '\u{212A}'),
2188            ])
2189        );
2190
2191        assert_eq!(
2192            t("(?-u)[[:lower:]]"),
2193            hir_ascii_bclass(&ast::ClassAsciiKind::Lower)
2194        );
2195        assert_eq!(
2196            t("(?i-u)[[:lower:]]"),
2197            hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower))
2198        );
2199
2200        assert_eq!(
2201            t_err("(?-u)[[:^lower:]]"),
2202            TestError {
2203                kind: hir::ErrorKind::InvalidUtf8,
2204                span: Span::new(
2205                    Position::new(6, 1, 7),
2206                    Position::new(16, 1, 17)
2207                ),
2208            }
2209        );
2210        assert_eq!(
2211            t_err("(?i-u)[[:^lower:]]"),
2212            TestError {
2213                kind: hir::ErrorKind::InvalidUtf8,
2214                span: Span::new(
2215                    Position::new(7, 1, 8),
2216                    Position::new(17, 1, 18)
2217                ),
2218            }
2219        );
2220    }
2221
2222    #[test]
2223    fn class_ascii_multiple() {
2224        // See: https://github.com/rust-lang/regex/issues/680
2225        assert_eq!(
2226            t("[[:alnum:][:^ascii:]]"),
2227            hir_union(
2228                hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
2229                hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
2230            ),
2231        );
2232        assert_eq!(
2233            t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
2234            hir_union(
2235                hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
2236                hir_bclass(&[(0x80, 0xFF)]),
2237            ),
2238        );
2239    }
2240
2241    #[test]
2242    #[cfg(feature = "unicode-perl")]
2243    fn class_perl_unicode() {
2244        // Unicode
2245        assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
2246        assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
2247        assert_eq!(t(r"\w"), hir_uclass_perl_word());
2248        #[cfg(feature = "unicode-case")]
2249        assert_eq!(
2250            t(r"(?i)\d"),
2251            hir_uclass_query(ClassQuery::Binary("digit"))
2252        );
2253        #[cfg(feature = "unicode-case")]
2254        assert_eq!(
2255            t(r"(?i)\s"),
2256            hir_uclass_query(ClassQuery::Binary("space"))
2257        );
2258        #[cfg(feature = "unicode-case")]
2259        assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
2260
2261        // Unicode, negated
2262        assert_eq!(
2263            t(r"\D"),
2264            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2265        );
2266        assert_eq!(
2267            t(r"\S"),
2268            hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
2269        );
2270        assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
2271        #[cfg(feature = "unicode-case")]
2272        assert_eq!(
2273            t(r"(?i)\D"),
2274            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2275        );
2276        #[cfg(feature = "unicode-case")]
2277        assert_eq!(
2278            t(r"(?i)\S"),
2279            hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
2280        );
2281        #[cfg(feature = "unicode-case")]
2282        assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
2283    }
2284
2285    #[test]
2286    fn class_perl_ascii() {
2287        // ASCII only
2288        assert_eq!(
2289            t(r"(?-u)\d"),
2290            hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
2291        );
2292        assert_eq!(
2293            t(r"(?-u)\s"),
2294            hir_ascii_bclass(&ast::ClassAsciiKind::Space)
2295        );
2296        assert_eq!(
2297            t(r"(?-u)\w"),
2298            hir_ascii_bclass(&ast::ClassAsciiKind::Word)
2299        );
2300        assert_eq!(
2301            t(r"(?i-u)\d"),
2302            hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
2303        );
2304        assert_eq!(
2305            t(r"(?i-u)\s"),
2306            hir_ascii_bclass(&ast::ClassAsciiKind::Space)
2307        );
2308        assert_eq!(
2309            t(r"(?i-u)\w"),
2310            hir_ascii_bclass(&ast::ClassAsciiKind::Word)
2311        );
2312
2313        // ASCII only, negated
2314        assert_eq!(
2315            t_bytes(r"(?-u)\D"),
2316            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
2317        );
2318        assert_eq!(
2319            t_bytes(r"(?-u)\S"),
2320            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
2321        );
2322        assert_eq!(
2323            t_bytes(r"(?-u)\W"),
2324            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
2325        );
2326        assert_eq!(
2327            t_bytes(r"(?i-u)\D"),
2328            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
2329        );
2330        assert_eq!(
2331            t_bytes(r"(?i-u)\S"),
2332            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
2333        );
2334        assert_eq!(
2335            t_bytes(r"(?i-u)\W"),
2336            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
2337        );
2338
2339        // ASCII only, negated, with UTF-8 mode enabled.
2340        // In this case, negating any Perl class results in an error because
2341        // all such classes can match invalid UTF-8.
2342        assert_eq!(
2343            t_err(r"(?-u)\D"),
2344            TestError {
2345                kind: hir::ErrorKind::InvalidUtf8,
2346                span: Span::new(
2347                    Position::new(5, 1, 6),
2348                    Position::new(7, 1, 8),
2349                ),
2350            },
2351        );
2352        assert_eq!(
2353            t_err(r"(?-u)\S"),
2354            TestError {
2355                kind: hir::ErrorKind::InvalidUtf8,
2356                span: Span::new(
2357                    Position::new(5, 1, 6),
2358                    Position::new(7, 1, 8),
2359                ),
2360            },
2361        );
2362        assert_eq!(
2363            t_err(r"(?-u)\W"),
2364            TestError {
2365                kind: hir::ErrorKind::InvalidUtf8,
2366                span: Span::new(
2367                    Position::new(5, 1, 6),
2368                    Position::new(7, 1, 8),
2369                ),
2370            },
2371        );
2372        assert_eq!(
2373            t_err(r"(?i-u)\D"),
2374            TestError {
2375                kind: hir::ErrorKind::InvalidUtf8,
2376                span: Span::new(
2377                    Position::new(6, 1, 7),
2378                    Position::new(8, 1, 9),
2379                ),
2380            },
2381        );
2382        assert_eq!(
2383            t_err(r"(?i-u)\S"),
2384            TestError {
2385                kind: hir::ErrorKind::InvalidUtf8,
2386                span: Span::new(
2387                    Position::new(6, 1, 7),
2388                    Position::new(8, 1, 9),
2389                ),
2390            },
2391        );
2392        assert_eq!(
2393            t_err(r"(?i-u)\W"),
2394            TestError {
2395                kind: hir::ErrorKind::InvalidUtf8,
2396                span: Span::new(
2397                    Position::new(6, 1, 7),
2398                    Position::new(8, 1, 9),
2399                ),
2400            },
2401        );
2402    }
2403
2404    #[test]
2405    #[cfg(not(feature = "unicode-perl"))]
2406    fn class_perl_word_disabled() {
2407        assert_eq!(
2408            t_err(r"\w"),
2409            TestError {
2410                kind: hir::ErrorKind::UnicodePerlClassNotFound,
2411                span: Span::new(
2412                    Position::new(0, 1, 1),
2413                    Position::new(2, 1, 3)
2414                ),
2415            }
2416        );
2417    }
2418
2419    #[test]
2420    #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
2421    fn class_perl_space_disabled() {
2422        assert_eq!(
2423            t_err(r"\s"),
2424            TestError {
2425                kind: hir::ErrorKind::UnicodePerlClassNotFound,
2426                span: Span::new(
2427                    Position::new(0, 1, 1),
2428                    Position::new(2, 1, 3)
2429                ),
2430            }
2431        );
2432    }
2433
2434    #[test]
2435    #[cfg(all(
2436        not(feature = "unicode-perl"),
2437        not(feature = "unicode-gencat")
2438    ))]
2439    fn class_perl_digit_disabled() {
2440        assert_eq!(
2441            t_err(r"\d"),
2442            TestError {
2443                kind: hir::ErrorKind::UnicodePerlClassNotFound,
2444                span: Span::new(
2445                    Position::new(0, 1, 1),
2446                    Position::new(2, 1, 3)
2447                ),
2448            }
2449        );
2450    }
2451
2452    #[test]
2453    #[cfg(feature = "unicode-gencat")]
2454    fn class_unicode_gencat() {
2455        assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
2456        assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
2457        assert_eq!(
2458            t(r"\p{Separator}"),
2459            hir_uclass_query(ClassQuery::Binary("Z"))
2460        );
2461        assert_eq!(
2462            t(r"\p{se      PaRa ToR}"),
2463            hir_uclass_query(ClassQuery::Binary("Z"))
2464        );
2465        assert_eq!(
2466            t(r"\p{gc:Separator}"),
2467            hir_uclass_query(ClassQuery::Binary("Z"))
2468        );
2469        assert_eq!(
2470            t(r"\p{gc=Separator}"),
2471            hir_uclass_query(ClassQuery::Binary("Z"))
2472        );
2473        assert_eq!(
2474            t(r"\p{gc!=Separator}"),
2475            hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2476        );
2477        assert_eq!(
2478            t(r"\p{Other}"),
2479            hir_uclass_query(ClassQuery::Binary("Other"))
2480        );
2481        assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));
2482
2483        assert_eq!(
2484            t(r"\PZ"),
2485            hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2486        );
2487        assert_eq!(
2488            t(r"\P{separator}"),
2489            hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2490        );
2491        assert_eq!(
2492            t(r"\P{gc!=separator}"),
2493            hir_uclass_query(ClassQuery::Binary("Z"))
2494        );
2495
2496        assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
2497        assert_eq!(
2498            t(r"\p{assigned}"),
2499            hir_uclass_query(ClassQuery::Binary("Assigned"))
2500        );
2501        assert_eq!(
2502            t(r"\p{ascii}"),
2503            hir_uclass_query(ClassQuery::Binary("ASCII"))
2504        );
2505        assert_eq!(
2506            t(r"\p{gc:any}"),
2507            hir_uclass_query(ClassQuery::Binary("Any"))
2508        );
2509        assert_eq!(
2510            t(r"\p{gc:assigned}"),
2511            hir_uclass_query(ClassQuery::Binary("Assigned"))
2512        );
2513        assert_eq!(
2514            t(r"\p{gc:ascii}"),
2515            hir_uclass_query(ClassQuery::Binary("ASCII"))
2516        );
2517
2518        assert_eq!(
2519            t_err(r"(?-u)\pZ"),
2520            TestError {
2521                kind: hir::ErrorKind::UnicodeNotAllowed,
2522                span: Span::new(
2523                    Position::new(5, 1, 6),
2524                    Position::new(8, 1, 9)
2525                ),
2526            }
2527        );
2528        assert_eq!(
2529            t_err(r"(?-u)\p{Separator}"),
2530            TestError {
2531                kind: hir::ErrorKind::UnicodeNotAllowed,
2532                span: Span::new(
2533                    Position::new(5, 1, 6),
2534                    Position::new(18, 1, 19)
2535                ),
2536            }
2537        );
2538        assert_eq!(
2539            t_err(r"\pE"),
2540            TestError {
2541                kind: hir::ErrorKind::UnicodePropertyNotFound,
2542                span: Span::new(
2543                    Position::new(0, 1, 1),
2544                    Position::new(3, 1, 4)
2545                ),
2546            }
2547        );
2548        assert_eq!(
2549            t_err(r"\p{Foo}"),
2550            TestError {
2551                kind: hir::ErrorKind::UnicodePropertyNotFound,
2552                span: Span::new(
2553                    Position::new(0, 1, 1),
2554                    Position::new(7, 1, 8)
2555                ),
2556            }
2557        );
2558        assert_eq!(
2559            t_err(r"\p{gc:Foo}"),
2560            TestError {
2561                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2562                span: Span::new(
2563                    Position::new(0, 1, 1),
2564                    Position::new(10, 1, 11)
2565                ),
2566            }
2567        );
2568    }
2569
2570    #[test]
2571    #[cfg(not(feature = "unicode-gencat"))]
2572    fn class_unicode_gencat_disabled() {
2573        assert_eq!(
2574            t_err(r"\p{Separator}"),
2575            TestError {
2576                kind: hir::ErrorKind::UnicodePropertyNotFound,
2577                span: Span::new(
2578                    Position::new(0, 1, 1),
2579                    Position::new(13, 1, 14)
2580                ),
2581            }
2582        );
2583
2584        assert_eq!(
2585            t_err(r"\p{Any}"),
2586            TestError {
2587                kind: hir::ErrorKind::UnicodePropertyNotFound,
2588                span: Span::new(
2589                    Position::new(0, 1, 1),
2590                    Position::new(7, 1, 8)
2591                ),
2592            }
2593        );
2594    }
2595
2596    #[test]
2597    #[cfg(feature = "unicode-script")]
2598    fn class_unicode_script() {
2599        assert_eq!(
2600            t(r"\p{Greek}"),
2601            hir_uclass_query(ClassQuery::Binary("Greek"))
2602        );
2603        #[cfg(feature = "unicode-case")]
2604        assert_eq!(
2605            t(r"(?i)\p{Greek}"),
2606            hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
2607        );
2608        #[cfg(feature = "unicode-case")]
2609        assert_eq!(
2610            t(r"(?i)\P{Greek}"),
2611            hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2612                "Greek"
2613            ))))
2614        );
2615
2616        assert_eq!(
2617            t_err(r"\p{sc:Foo}"),
2618            TestError {
2619                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2620                span: Span::new(
2621                    Position::new(0, 1, 1),
2622                    Position::new(10, 1, 11)
2623                ),
2624            }
2625        );
2626        assert_eq!(
2627            t_err(r"\p{scx:Foo}"),
2628            TestError {
2629                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2630                span: Span::new(
2631                    Position::new(0, 1, 1),
2632                    Position::new(11, 1, 12)
2633                ),
2634            }
2635        );
2636    }
2637
2638    #[test]
2639    #[cfg(not(feature = "unicode-script"))]
2640    fn class_unicode_script_disabled() {
2641        assert_eq!(
2642            t_err(r"\p{Greek}"),
2643            TestError {
2644                kind: hir::ErrorKind::UnicodePropertyNotFound,
2645                span: Span::new(
2646                    Position::new(0, 1, 1),
2647                    Position::new(9, 1, 10)
2648                ),
2649            }
2650        );
2651
2652        assert_eq!(
2653            t_err(r"\p{scx:Greek}"),
2654            TestError {
2655                kind: hir::ErrorKind::UnicodePropertyNotFound,
2656                span: Span::new(
2657                    Position::new(0, 1, 1),
2658                    Position::new(13, 1, 14)
2659                ),
2660            }
2661        );
2662    }
2663
2664    #[test]
2665    #[cfg(feature = "unicode-age")]
2666    fn class_unicode_age() {
2667        assert_eq!(
2668            t_err(r"\p{age:Foo}"),
2669            TestError {
2670                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2671                span: Span::new(
2672                    Position::new(0, 1, 1),
2673                    Position::new(11, 1, 12)
2674                ),
2675            }
2676        );
2677    }
2678
2679    #[test]
2680    #[cfg(feature = "unicode-gencat")]
2681    fn class_unicode_any_empty() {
2682        assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
2683    }
2684
2685    #[test]
2686    #[cfg(not(feature = "unicode-age"))]
2687    fn class_unicode_age_disabled() {
2688        assert_eq!(
2689            t_err(r"\p{age:3.0}"),
2690            TestError {
2691                kind: hir::ErrorKind::UnicodePropertyNotFound,
2692                span: Span::new(
2693                    Position::new(0, 1, 1),
2694                    Position::new(11, 1, 12)
2695                ),
2696            }
2697        );
2698    }
2699
    /// Exercises translation of bracketed character classes (`[...]`):
    /// literals, ranges, escapes, Perl/Unicode classes nested inside
    /// brackets, negation, case insensitivity and byte-oriented classes.
    ///
    /// Assertions that need Unicode tables are individually gated on the
    /// crate feature(s) they depend on.
    #[test]
    fn class_bracketed() {
        // Plain items and ranges. Overlapping (`a-fd-h`) and adjacent
        // (`a-fg-m`) ranges are expected to come out as a single range.
        assert_eq!(t("[a]"), hir_lit("a"));
        assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
        assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
        assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
        assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
        assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
        assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
        // Both the escape sequence `\n` and a literal newline character are
        // accepted inside a class.
        assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
        assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
        // Perl and Unicode property classes nested inside brackets.
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[\pZ]"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[\p{separator}]"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
        // A negated bracket around a negated class is expected to equal the
        // un-negated class.
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[^\PZ]"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[^\P{separator}]"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
        // Same double negation, now combined with the `i` flag.
        #[cfg(all(
            feature = "unicode-case",
            any(feature = "unicode-perl", feature = "unicode-gencat")
        ))]
        assert_eq!(
            t(r"(?i)[^\D]"),
            hir_uclass_query(ClassQuery::Binary("digit"))
        );
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        assert_eq!(
            t(r"(?i)[^\P{greek}]"),
            hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
        );

        // With Unicode mode off (`-u`), the expected classes are byte
        // classes. NOTE(review): the `\xFF` case goes through `t_bytes`
        // rather than `t`; presumably `t` rejects non-UTF-8 results --
        // confirm against the helper definitions.
        assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
        assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
        assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));

        // Case-insensitive classes. In Unicode mode `k` also picks up
        // U+212A and `β` picks up `ϐ`; in byte mode only the ASCII pair is
        // expected.
        #[cfg(feature = "unicode-case")]
        assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[k]"),
            hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[β]"),
            hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
        );
        assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));

        // Negated classes.
        assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
        assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
        assert_eq!(
            t_bytes("(?-u)[^a]"),
            class_negate(bclass(&[(b'a', b'a')]))
        );
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(
            t(r"[^\d]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
        );
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[^\pZ]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
        );
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[^\p{separator}]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
        );
        // In both expectations below, case folding is applied to the class
        // before it is negated.
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        assert_eq!(
            t(r"(?i)[^\p{greek}]"),
            hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
                "greek"
            ))))
        );
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        assert_eq!(
            t(r"(?i)[\P{greek}]"),
            hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
                "greek"
            ))))
        );

        // Test some weird cases.
        assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));

        // A single `&` is an ordinary class member, escaped or not, and may
        // be a range endpoint.
        assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
        assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
        assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
        assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
        assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));

        // Likewise for a single `~`.
        assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
        assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
        assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
        assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
        assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));

        // Likewise for `-`, including an escaped `-` used as a range
        // endpoint.
        assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
        assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
        assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
        assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
        assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));

        // Through `t_err`, the negated byte class that `t_bytes` accepted
        // above is expected to fail with `InvalidUtf8`; the span covers the
        // class itself (offsets 5..9), not the flag group.
        assert_eq!(
            t_err("(?-u)[^a]"),
            TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(5, 1, 6),
                    Position::new(9, 1, 10)
                ),
            }
        );
        // `[^\s\S]` is expected to translate to an empty class in both
        // Unicode and byte mode.
        #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
        assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
        #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
        assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
    }
2839
    /// Exercises implicit union inside bracketed classes: several items
    /// (ranges, property classes, nested classes) placed side by side.
    ///
    /// Expected values are built with `hir_union`. NOTE(review): the operand
    /// order passed to `hir_union` does not always follow the order in the
    /// pattern (e.g. `greek` before `separator`); presumably the resulting
    /// class does not depend on operand order -- confirm against the helper.
    #[test]
    fn class_bracketed_union() {
        assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
        // Literals on either side of a property class.
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[a\pZb]"),
            hir_union(
                hir_uclass(&[('a', 'b')]),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        );
        // Two property classes.
        #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
        assert_eq!(
            t(r"[\pZ\p{Greek}]"),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        );
        // Three property classes, one of them a `name:value` query.
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"[\p{age:3.0}\pZ\p{Greek}]"),
            hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            )
        );
        // Arbitrarily nested brackets are expected to flatten into one
        // union.
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
            hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("cyrillic")),
                    hir_union(
                        hir_uclass_query(ClassQuery::Binary("greek")),
                        hir_uclass_query(ClassQuery::Binary("separator"))
                    )
                )
            )
        );

        // Case folding is applied to the union as a whole.
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-case",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
            hir_case_fold(hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            ))
        );
        // Negation is applied to the union as a whole.
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            ))
        );
        // With both: the union is case folded first, then negated.
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-case",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(hir_case_fold(hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            )))
        );
    }
2956
2957    #[test]
2958    fn class_bracketed_nested() {
2959        assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
2960        assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
2961        assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));
2962
2963        assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
2964        assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
2965
2966        #[cfg(feature = "unicode-case")]
2967        assert_eq!(
2968            t(r"(?i)[a[^c]]"),
2969            hir_negate(class_case_fold(uclass(&[('c', 'c')])))
2970        );
2971        #[cfg(feature = "unicode-case")]
2972        assert_eq!(
2973            t(r"(?i)[a-b[^c]]"),
2974            hir_negate(class_case_fold(uclass(&[('c', 'c')])))
2975        );
2976
2977        #[cfg(feature = "unicode-case")]
2978        assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
2979        #[cfg(feature = "unicode-case")]
2980        assert_eq!(
2981            t(r"(?i)[^a-b[^c]]"),
2982            hir_uclass(&[('C', 'C'), ('c', 'c')])
2983        );
2984
2985        assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
2986        #[cfg(feature = "unicode-case")]
2987        assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
2988    }
2989
2990    #[test]
2991    fn class_bracketed_intersect() {
2992        assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
2993        assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2994        assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2995        assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
2996        assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
2997        assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
2998        assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
2999        assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
3000        assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
3001
3002        assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
3003        assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
3004        assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
3005        assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
3006        assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
3007        assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
3008
3009        #[cfg(feature = "unicode-case")]
3010        assert_eq!(
3011            t("(?i)[abc&&b-c]"),
3012            hir_case_fold(hir_uclass(&[('b', 'c')]))
3013        );
3014        #[cfg(feature = "unicode-case")]
3015        assert_eq!(
3016            t("(?i)[abc&&[b-c]]"),
3017            hir_case_fold(hir_uclass(&[('b', 'c')]))
3018        );
3019        #[cfg(feature = "unicode-case")]
3020        assert_eq!(
3021            t("(?i)[[abc]&&[b-c]]"),
3022            hir_case_fold(hir_uclass(&[('b', 'c')]))
3023        );
3024        #[cfg(feature = "unicode-case")]
3025        assert_eq!(
3026            t("(?i)[a-z&&b-y&&c-x]"),
3027            hir_case_fold(hir_uclass(&[('c', 'x')]))
3028        );
3029        #[cfg(feature = "unicode-case")]
3030        assert_eq!(
3031            t("(?i)[c-da-b&&a-d]"),
3032            hir_case_fold(hir_uclass(&[('a', 'd')]))
3033        );
3034        #[cfg(feature = "unicode-case")]
3035        assert_eq!(
3036            t("(?i)[a-d&&c-da-b]"),
3037            hir_case_fold(hir_uclass(&[('a', 'd')]))
3038        );
3039
3040        assert_eq!(
3041            t("(?i-u)[abc&&b-c]"),
3042            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3043        );
3044        assert_eq!(
3045            t("(?i-u)[abc&&[b-c]]"),
3046            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3047        );
3048        assert_eq!(
3049            t("(?i-u)[[abc]&&[b-c]]"),
3050            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3051        );
3052        assert_eq!(
3053            t("(?i-u)[a-z&&b-y&&c-x]"),
3054            hir_case_fold(hir_bclass(&[(b'c', b'x')]))
3055        );
3056        assert_eq!(
3057            t("(?i-u)[c-da-b&&a-d]"),
3058            hir_case_fold(hir_bclass(&[(b'a', b'd')]))
3059        );
3060        assert_eq!(
3061            t("(?i-u)[a-d&&c-da-b]"),
3062            hir_case_fold(hir_bclass(&[(b'a', b'd')]))
3063        );
3064
3065        // In `[a^]`, `^` does not need to be escaped, so it makes sense that
3066        // `^` is also allowed to be unescaped after `&&`.
3067        assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
3068        // `]` needs to be escaped after `&&` since it's not at start of class.
3069        assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
3070        assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
3071        assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
3072        assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
3073        // Test precedence.
3074        assert_eq!(
3075            t(r"[a-w&&[^c-g]z]"),
3076            hir_uclass(&[('a', 'b'), ('h', 'w')])
3077        );
3078    }
3079
    /// Exercises how negation (`^`) combines with intersection (`&&`) in
    /// bracketed classes, first in Unicode mode and then in byte mode.
    ///
    /// Per the expectations below, a leading `^` negates the result of the
    /// whole intersection rather than only its first operand.
    #[test]
    fn class_bracketed_intersect_negate() {
        // `\w && \d` is expected to equal the digit class, so negating the
        // intersection gives "not a digit".
        #[cfg(feature = "unicode-perl")]
        assert_eq!(
            t(r"[^\w&&\d]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
        );
        assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
        #[cfg(feature = "unicode-perl")]
        assert_eq!(
            t(r"[^[\w&&\d]]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
        );
        // Two negations cancel out.
        #[cfg(feature = "unicode-perl")]
        assert_eq!(
            t(r"[^[^\w&&\d]]"),
            hir_uclass_query(ClassQuery::Binary("digit"))
        );
        // "not word" intersected with "not digit" is expected to equal
        // "not word".
        #[cfg(feature = "unicode-perl")]
        assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));

        // The same cases with Unicode mode disabled; the expected values
        // are built from the ASCII byte classes.
        #[cfg(feature = "unicode-perl")]
        assert_eq!(
            t_bytes(r"(?-u)[^\w&&\d]"),
            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
        );
        assert_eq!(
            t_bytes(r"(?-u)[^[a-z&&a-c]]"),
            hir_negate(hir_bclass(&[(b'a', b'c')]))
        );
        assert_eq!(
            t_bytes(r"(?-u)[^[\w&&\d]]"),
            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
        );
        assert_eq!(
            t_bytes(r"(?-u)[^[^\w&&\d]]"),
            hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
        );
        assert_eq!(
            t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
        );
    }
3123
3124    #[test]
3125    fn class_bracketed_difference() {
3126        #[cfg(feature = "unicode-gencat")]
3127        assert_eq!(
3128            t(r"[\pL--[:ascii:]]"),
3129            hir_difference(
3130                hir_uclass_query(ClassQuery::Binary("letter")),
3131                hir_uclass(&[('\0', '\x7F')])
3132            )
3133        );
3134
3135        assert_eq!(
3136            t(r"(?-u)[[:alpha:]--[:lower:]]"),
3137            hir_bclass(&[(b'A', b'Z')])
3138        );
3139    }
3140
    /// Exercises symmetric difference (`~~`) inside bracketed classes, i.e.
    /// the characters that belong to exactly one of the two operands.
    #[test]
    fn class_bracketed_symmetric_difference() {
        // Script (`sc`) versus script extensions (`scx`) for Greek. The
        // commented-out `Class({...})` listing mirrors the expected ranges
        // passed to `hir_uclass` right below it.
        #[cfg(feature = "unicode-script")]
        assert_eq!(
            t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
            // Class({
            //     '·'..='·',
            //     '\u{300}'..='\u{301}',
            //     '\u{304}'..='\u{304}',
            //     '\u{306}'..='\u{306}',
            //     '\u{308}'..='\u{308}',
            //     '\u{313}'..='\u{313}',
            //     '\u{342}'..='\u{342}',
            //     '\u{345}'..='\u{345}',
            //     'ʹ'..='ʹ',
            //     '\u{1dc0}'..='\u{1dc1}',
            //     '⁝'..='⁝',
            // })
            hir_uclass(&[
                ('·', '·'),
                ('\u{0300}', '\u{0301}'),
                ('\u{0304}', '\u{0304}'),
                ('\u{0306}', '\u{0306}'),
                ('\u{0308}', '\u{0308}'),
                ('\u{0313}', '\u{0313}'),
                ('\u{0342}', '\u{0342}'),
                ('\u{0345}', '\u{0345}'),
                ('ʹ', 'ʹ'),
                ('\u{1DC0}', '\u{1DC1}'),
                ('⁝', '⁝'),
            ])
        );
        // `a-g` and `c-j` overlap on `c-g`, which is therefore dropped.
        assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));

        // Same ranges in byte mode.
        assert_eq!(
            t(r"(?-u)[a-g~~c-j]"),
            hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
        );
    }
3180
    /// Exercises translation under the `x` flag, where whitespace and
    /// `# ...` comments in the pattern are ignored, including in the middle
    /// of escapes, property classes and counted repetitions.
    ///
    /// Several patterns are multi-line raw strings; their continuation lines
    /// are part of the pattern text and so are deliberately not indented to
    /// match the surrounding code.
    #[test]
    fn ignore_whitespace() {
        // `\12` is expected to yield a newline and the separated `3` stays
        // its own literal.
        assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
        // Hex escapes in brace form, with whitespace and comments around and
        // inside the braces.
        assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
        assert_eq!(
            t(r"(?x)\x # comment
{ # comment
    53 # comment
} #comment"),
            hir_lit("S")
        );

        // Hex escapes without braces, with whitespace or comments between
        // `\x` and the digits, or between the two digits.
        assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
        assert_eq!(
            t(r"(?x)\x # comment
        53 # comment"),
            hir_lit("S")
        );
        assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));

        // A Unicode property class spread over several commented lines.
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"(?x)\p # comment
{ # comment
    Separator # comment
} # comment"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );

        // A counted repetition `{5,10}` spread over several commented lines.
        assert_eq!(
            t(r"(?x)a # comment
{ # comment
    5 # comment
    , # comment
    10 # comment
} # comment"),
            hir_range(true, 5, Some(10), hir_lit("a"))
        );

        // A backslash-escaped space is kept as a literal space; the
        // unescaped space and the comment after it are dropped.
        assert_eq!(t(r"(?x)a\  # hi there"), hir_lit("a "));
    }
3222
3223    #[test]
3224    fn analysis_is_utf8() {
3225        // Positive examples.
3226        assert!(props_bytes(r"a").is_utf8());
3227        assert!(props_bytes(r"ab").is_utf8());
3228        assert!(props_bytes(r"(?-u)a").is_utf8());
3229        assert!(props_bytes(r"(?-u)ab").is_utf8());
3230        assert!(props_bytes(r"\xFF").is_utf8());
3231        assert!(props_bytes(r"\xFF\xFF").is_utf8());
3232        assert!(props_bytes(r"[^a]").is_utf8());
3233        assert!(props_bytes(r"[^a][^a]").is_utf8());
3234        assert!(props_bytes(r"\b").is_utf8());
3235        assert!(props_bytes(r"\B").is_utf8());
3236        assert!(props_bytes(r"(?-u)\b").is_utf8());
3237        assert!(props_bytes(r"(?-u)\B").is_utf8());
3238
3239        // Negative examples.
3240        assert!(!props_bytes(r"(?-u)\xFF").is_utf8());
3241        assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8());
3242        assert!(!props_bytes(r"(?-u)[^a]").is_utf8());
3243        assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8());
3244    }
3245
3246    #[test]
3247    fn analysis_captures_len() {
3248        assert_eq!(0, props(r"a").explicit_captures_len());
3249        assert_eq!(0, props(r"(?:a)").explicit_captures_len());
3250        assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len());
3251        assert_eq!(0, props(r"(?i-u)a").explicit_captures_len());
3252        assert_eq!(1, props(r"(a)").explicit_captures_len());
3253        assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len());
3254        assert_eq!(1, props(r"()").explicit_captures_len());
3255        assert_eq!(1, props(r"()a").explicit_captures_len());
3256        assert_eq!(1, props(r"(a)+").explicit_captures_len());
3257        assert_eq!(2, props(r"(a)(b)").explicit_captures_len());
3258        assert_eq!(2, props(r"(a)|(b)").explicit_captures_len());
3259        assert_eq!(2, props(r"((a))").explicit_captures_len());
3260        assert_eq!(1, props(r"([a&&b])").explicit_captures_len());
3261    }
3262
3263    #[test]
3264    fn analysis_static_captures_len() {
3265        let len = |pattern| props(pattern).static_explicit_captures_len();
3266        assert_eq!(Some(0), len(r""));
3267        assert_eq!(Some(0), len(r"foo|bar"));
3268        assert_eq!(None, len(r"(foo)|bar"));
3269        assert_eq!(None, len(r"foo|(bar)"));
3270        assert_eq!(Some(1), len(r"(foo|bar)"));
3271        assert_eq!(Some(1), len(r"(a|b|c|d|e|f)"));
3272        assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)"));
3273        assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)"));
3274        assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)"));
3275        assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()"));
3276        assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)"));
3277        assert_eq!(None, len(r"(a)(b)(extra)?"));
3278        assert_eq!(Some(1), len(r"(foo)|(bar)"));
3279        assert_eq!(Some(2), len(r"(foo)(bar)"));
3280        assert_eq!(Some(2), len(r"(foo)+(bar)"));
3281        assert_eq!(None, len(r"(foo)*(bar)"));
3282        assert_eq!(Some(0), len(r"(foo)?{0}"));
3283        assert_eq!(None, len(r"(foo)?{1}"));
3284        assert_eq!(Some(1), len(r"(foo){1}"));
3285        assert_eq!(Some(1), len(r"(foo){1,}"));
3286        assert_eq!(Some(1), len(r"(foo){1,}?"));
3287        assert_eq!(None, len(r"(foo){1,}??"));
3288        assert_eq!(None, len(r"(foo){0,}"));
3289        assert_eq!(Some(1), len(r"(foo)(?:bar)"));
3290        assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))"));
3291        assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)"));
3292        assert_eq!(
3293            Some(2),
3294            len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#)
3295        );
3296    }
3297
3298    #[test]
3299    fn analysis_is_all_assertions() {
3300        // Positive examples.
3301        let p = props(r"\b");
3302        assert!(!p.look_set().is_empty());
3303        assert_eq!(p.minimum_len(), Some(0));
3304
3305        let p = props(r"\B");
3306        assert!(!p.look_set().is_empty());
3307        assert_eq!(p.minimum_len(), Some(0));
3308
3309        let p = props(r"^");
3310        assert!(!p.look_set().is_empty());
3311        assert_eq!(p.minimum_len(), Some(0));
3312
3313        let p = props(r"$");
3314        assert!(!p.look_set().is_empty());
3315        assert_eq!(p.minimum_len(), Some(0));
3316
3317        let p = props(r"\A");
3318        assert!(!p.look_set().is_empty());
3319        assert_eq!(p.minimum_len(), Some(0));
3320
3321        let p = props(r"\z");
3322        assert!(!p.look_set().is_empty());
3323        assert_eq!(p.minimum_len(), Some(0));
3324
3325        let p = props(r"$^\z\A\b\B");
3326        assert!(!p.look_set().is_empty());
3327        assert_eq!(p.minimum_len(), Some(0));
3328
3329        let p = props(r"$|^|\z|\A|\b|\B");
3330        assert!(!p.look_set().is_empty());
3331        assert_eq!(p.minimum_len(), Some(0));
3332
3333        let p = props(r"^$|$^");
3334        assert!(!p.look_set().is_empty());
3335        assert_eq!(p.minimum_len(), Some(0));
3336
3337        let p = props(r"((\b)+())*^");
3338        assert!(!p.look_set().is_empty());
3339        assert_eq!(p.minimum_len(), Some(0));
3340
3341        // Negative examples.
3342        let p = props(r"^a");
3343        assert!(!p.look_set().is_empty());
3344        assert_eq!(p.minimum_len(), Some(1));
3345    }
3346
3347    #[test]
3348    fn analysis_look_set_prefix_any() {
3349        let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
3350        assert!(p.look_set_prefix_any().contains(Look::WordAscii));
3351    }
3352
3353    #[test]
3354    fn analysis_is_anchored() {
3355        let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
3356        let is_end = |p| props(p).look_set_suffix().contains(Look::End);
3357
3358        // Positive examples.
3359        assert!(is_start(r"^"));
3360        assert!(is_end(r"$"));
3361
3362        assert!(is_start(r"^^"));
3363        assert!(props(r"$$").look_set_suffix().contains(Look::End));
3364
3365        assert!(is_start(r"^$"));
3366        assert!(is_end(r"^$"));
3367
3368        assert!(is_start(r"^foo"));
3369        assert!(is_end(r"foo$"));
3370
3371        assert!(is_start(r"^foo|^bar"));
3372        assert!(is_end(r"foo$|bar$"));
3373
3374        assert!(is_start(r"^(foo|bar)"));
3375        assert!(is_end(r"(foo|bar)$"));
3376
3377        assert!(is_start(r"^+"));
3378        assert!(is_end(r"$+"));
3379        assert!(is_start(r"^++"));
3380        assert!(is_end(r"$++"));
3381        assert!(is_start(r"(^)+"));
3382        assert!(is_end(r"($)+"));
3383
3384        assert!(is_start(r"$^"));
3385        assert!(is_start(r"$^"));
3386        assert!(is_start(r"$^|^$"));
3387        assert!(is_end(r"$^|^$"));
3388
3389        assert!(is_start(r"\b^"));
3390        assert!(is_end(r"$\b"));
3391        assert!(is_start(r"^(?m:^)"));
3392        assert!(is_end(r"(?m:$)$"));
3393        assert!(is_start(r"(?m:^)^"));
3394        assert!(is_end(r"$(?m:$)"));
3395
3396        // Negative examples.
3397        assert!(!is_start(r"(?m)^"));
3398        assert!(!is_end(r"(?m)$"));
3399        assert!(!is_start(r"(?m:^$)|$^"));
3400        assert!(!is_end(r"(?m:^$)|$^"));
3401        assert!(!is_start(r"$^|(?m:^$)"));
3402        assert!(!is_end(r"$^|(?m:^$)"));
3403
3404        assert!(!is_start(r"a^"));
3405        assert!(!is_start(r"$a"));
3406
3407        assert!(!is_end(r"a^"));
3408        assert!(!is_end(r"$a"));
3409
3410        assert!(!is_start(r"^foo|bar"));
3411        assert!(!is_end(r"foo|bar$"));
3412
3413        assert!(!is_start(r"^*"));
3414        assert!(!is_end(r"$*"));
3415        assert!(!is_start(r"^*+"));
3416        assert!(!is_end(r"$*+"));
3417        assert!(!is_start(r"^+*"));
3418        assert!(!is_end(r"$+*"));
3419        assert!(!is_start(r"(^)*"));
3420        assert!(!is_end(r"($)*"));
3421    }
3422
3423    #[test]
3424    fn analysis_is_any_anchored() {
3425        let is_start = |p| props(p).look_set().contains(Look::Start);
3426        let is_end = |p| props(p).look_set().contains(Look::End);
3427
3428        // Positive examples.
3429        assert!(is_start(r"^"));
3430        assert!(is_end(r"$"));
3431        assert!(is_start(r"\A"));
3432        assert!(is_end(r"\z"));
3433
3434        // Negative examples.
3435        assert!(!is_start(r"(?m)^"));
3436        assert!(!is_end(r"(?m)$"));
3437        assert!(!is_start(r"$"));
3438        assert!(!is_end(r"^"));
3439    }
3440
3441    #[test]
3442    fn analysis_can_empty() {
3443        // Positive examples.
3444        let assert_empty =
3445            |p| assert_eq!(Some(0), props_bytes(p).minimum_len());
3446        assert_empty(r"");
3447        assert_empty(r"()");
3448        assert_empty(r"()*");
3449        assert_empty(r"()+");
3450        assert_empty(r"()?");
3451        assert_empty(r"a*");
3452        assert_empty(r"a?");
3453        assert_empty(r"a{0}");
3454        assert_empty(r"a{0,}");
3455        assert_empty(r"a{0,1}");
3456        assert_empty(r"a{0,10}");
3457        #[cfg(feature = "unicode-gencat")]
3458        assert_empty(r"\pL*");
3459        assert_empty(r"a*|b");
3460        assert_empty(r"b|a*");
3461        assert_empty(r"a|");
3462        assert_empty(r"|a");
3463        assert_empty(r"a||b");
3464        assert_empty(r"a*a?(abcd)*");
3465        assert_empty(r"^");
3466        assert_empty(r"$");
3467        assert_empty(r"(?m)^");
3468        assert_empty(r"(?m)$");
3469        assert_empty(r"\A");
3470        assert_empty(r"\z");
3471        assert_empty(r"\B");
3472        assert_empty(r"(?-u)\B");
3473        assert_empty(r"\b");
3474        assert_empty(r"(?-u)\b");
3475
3476        // Negative examples.
3477        let assert_non_empty =
3478            |p| assert_ne!(Some(0), props_bytes(p).minimum_len());
3479        assert_non_empty(r"a+");
3480        assert_non_empty(r"a{1}");
3481        assert_non_empty(r"a{1,}");
3482        assert_non_empty(r"a{1,2}");
3483        assert_non_empty(r"a{1,10}");
3484        assert_non_empty(r"b|a");
3485        assert_non_empty(r"a*a+(abcd)*");
3486        #[cfg(feature = "unicode-gencat")]
3487        assert_non_empty(r"\P{any}");
3488        assert_non_empty(r"[a--a]");
3489        assert_non_empty(r"[a&&b]");
3490    }
3491
3492    #[test]
3493    fn analysis_is_literal() {
3494        // Positive examples.
3495        assert!(props(r"a").is_literal());
3496        assert!(props(r"ab").is_literal());
3497        assert!(props(r"abc").is_literal());
3498        assert!(props(r"(?m)abc").is_literal());
3499        assert!(props(r"(?:a)").is_literal());
3500        assert!(props(r"foo(?:a)").is_literal());
3501        assert!(props(r"(?:a)foo").is_literal());
3502        assert!(props(r"[a]").is_literal());
3503
3504        // Negative examples.
3505        assert!(!props(r"").is_literal());
3506        assert!(!props(r"^").is_literal());
3507        assert!(!props(r"a|b").is_literal());
3508        assert!(!props(r"(a)").is_literal());
3509        assert!(!props(r"a+").is_literal());
3510        assert!(!props(r"foo(a)").is_literal());
3511        assert!(!props(r"(a)foo").is_literal());
3512        assert!(!props(r"[ab]").is_literal());
3513    }
3514
3515    #[test]
3516    fn analysis_is_alternation_literal() {
3517        // Positive examples.
3518        assert!(props(r"a").is_alternation_literal());
3519        assert!(props(r"ab").is_alternation_literal());
3520        assert!(props(r"abc").is_alternation_literal());
3521        assert!(props(r"(?m)abc").is_alternation_literal());
3522        assert!(props(r"foo|bar").is_alternation_literal());
3523        assert!(props(r"foo|bar|baz").is_alternation_literal());
3524        assert!(props(r"[a]").is_alternation_literal());
3525        assert!(props(r"(?:ab)|cd").is_alternation_literal());
3526        assert!(props(r"ab|(?:cd)").is_alternation_literal());
3527
3528        // Negative examples.
3529        assert!(!props(r"").is_alternation_literal());
3530        assert!(!props(r"^").is_alternation_literal());
3531        assert!(!props(r"(a)").is_alternation_literal());
3532        assert!(!props(r"a+").is_alternation_literal());
3533        assert!(!props(r"foo(a)").is_alternation_literal());
3534        assert!(!props(r"(a)foo").is_alternation_literal());
3535        assert!(!props(r"[ab]").is_alternation_literal());
3536        assert!(!props(r"[ab]|b").is_alternation_literal());
3537        assert!(!props(r"a|[ab]").is_alternation_literal());
3538        assert!(!props(r"(a)|b").is_alternation_literal());
3539        assert!(!props(r"a|(b)").is_alternation_literal());
3540        assert!(!props(r"a|b").is_alternation_literal());
3541        assert!(!props(r"a|b|c").is_alternation_literal());
3542        assert!(!props(r"[a]|b").is_alternation_literal());
3543        assert!(!props(r"a|[b]").is_alternation_literal());
3544        assert!(!props(r"(?:a)|b").is_alternation_literal());
3545        assert!(!props(r"a|(?:b)").is_alternation_literal());
3546        assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal());
3547    }
3548
3549    // This tests that the smart Hir::repetition constructors does some basic
3550    // simplifications.
3551    #[test]
3552    fn smart_repetition() {
3553        assert_eq!(t(r"a{0}"), Hir::empty());
3554        assert_eq!(t(r"a{1}"), hir_lit("a"));
3555        assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate));
3556    }
3557
3558    // This tests that the smart Hir::concat constructor simplifies the given
3559    // exprs in a way we expect.
3560    #[test]
3561    fn smart_concat() {
3562        assert_eq!(t(""), Hir::empty());
3563        assert_eq!(t("(?:)"), Hir::empty());
3564        assert_eq!(t("abc"), hir_lit("abc"));
3565        assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar"));
3566        assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz"));
3567        assert_eq!(
3568            t("foo(?:bar^baz)quux"),
3569            hir_cat(vec![
3570                hir_lit("foobar"),
3571                hir_look(hir::Look::Start),
3572                hir_lit("bazquux"),
3573            ])
3574        );
3575        assert_eq!(
3576            t("foo(?:ba(?:r^b)az)quux"),
3577            hir_cat(vec![
3578                hir_lit("foobar"),
3579                hir_look(hir::Look::Start),
3580                hir_lit("bazquux"),
3581            ])
3582        );
3583    }
3584
3585    // This tests that the smart Hir::alternation constructor simplifies the
3586    // given exprs in a way we expect.
3587    #[test]
3588    fn smart_alternation() {
3589        assert_eq!(
3590            t("(?:foo)|(?:bar)"),
3591            hir_alt(vec![hir_lit("foo"), hir_lit("bar")])
3592        );
3593        assert_eq!(
3594            t("quux|(?:abc|def|xyz)|baz"),
3595            hir_alt(vec![
3596                hir_lit("quux"),
3597                hir_lit("abc"),
3598                hir_lit("def"),
3599                hir_lit("xyz"),
3600                hir_lit("baz"),
3601            ])
3602        );
3603        assert_eq!(
3604            t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
3605            hir_alt(vec![
3606                hir_lit("quux"),
3607                hir_lit("abc"),
3608                hir_lit("def"),
3609                hir_lit("mno"),
3610                hir_lit("xyz"),
3611                hir_lit("baz"),
3612            ])
3613        );
3614        assert_eq!(
3615            t("a|b|c|d|e|f|x|y|z"),
3616            hir_uclass(&[('a', 'f'), ('x', 'z')]),
3617        );
3618        // Tests that we lift common prefixes out of an alternation.
3619        assert_eq!(
3620            t("[A-Z]foo|[A-Z]quux"),
3621            hir_cat(vec![
3622                hir_uclass(&[('A', 'Z')]),
3623                hir_alt(vec![hir_lit("foo"), hir_lit("quux")]),
3624            ]),
3625        );
3626        assert_eq!(
3627            t("[A-Z][A-Z]|[A-Z]quux"),
3628            hir_cat(vec![
3629                hir_uclass(&[('A', 'Z')]),
3630                hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]),
3631            ]),
3632        );
3633        assert_eq!(
3634            t("[A-Z][A-Z]|[A-Z][A-Z]quux"),
3635            hir_cat(vec![
3636                hir_uclass(&[('A', 'Z')]),
3637                hir_uclass(&[('A', 'Z')]),
3638                hir_alt(vec![Hir::empty(), hir_lit("quux")]),
3639            ]),
3640        );
3641        assert_eq!(
3642            t("[A-Z]foo|[A-Z]foobar"),
3643            hir_cat(vec![
3644                hir_uclass(&[('A', 'Z')]),
3645                hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
3646            ]),
3647        );
3648    }
3649
3650    #[test]
3651    fn regression_alt_empty_concat() {
3652        use crate::ast::{self, Ast};
3653
3654        let span = Span::splat(Position::new(0, 0, 0));
3655        let ast = Ast::alternation(ast::Alternation {
3656            span,
3657            asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
3658        });
3659
3660        let mut t = Translator::new();
3661        assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
3662    }
3663
3664    #[test]
3665    fn regression_empty_alt() {
3666        use crate::ast::{self, Ast};
3667
3668        let span = Span::splat(Position::new(0, 0, 0));
3669        let ast = Ast::concat(ast::Concat {
3670            span,
3671            asts: vec![Ast::alternation(ast::Alternation {
3672                span,
3673                asts: vec![],
3674            })],
3675        });
3676
3677        let mut t = Translator::new();
3678        assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
3679    }
3680
3681    #[test]
3682    fn regression_singleton_alt() {
3683        use crate::{
3684            ast::{self, Ast},
3685            hir::Dot,
3686        };
3687
3688        let span = Span::splat(Position::new(0, 0, 0));
3689        let ast = Ast::concat(ast::Concat {
3690            span,
3691            asts: vec![Ast::alternation(ast::Alternation {
3692                span,
3693                asts: vec![Ast::dot(span)],
3694            })],
3695        });
3696
3697        let mut t = Translator::new();
3698        assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
3699    }
3700
3701    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
3702    #[test]
3703    fn regression_fuzz_match() {
3704        let pat = "[(\u{6} \0-\u{afdf5}]  \0 ";
3705        let ast = ParserBuilder::new()
3706            .octal(false)
3707            .ignore_whitespace(true)
3708            .build()
3709            .parse(pat)
3710            .unwrap();
3711        let hir = TranslatorBuilder::new()
3712            .utf8(true)
3713            .case_insensitive(false)
3714            .multi_line(false)
3715            .dot_matches_new_line(false)
3716            .swap_greed(true)
3717            .unicode(true)
3718            .build()
3719            .translate(pat, &ast)
3720            .unwrap();
3721        assert_eq!(
3722            hir,
3723            Hir::concat(vec![
3724                hir_uclass(&[('\0', '\u{afdf5}')]),
3725                hir_lit("\0"),
3726            ])
3727        );
3728    }
3729
3730    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
3731    #[cfg(feature = "unicode")]
3732    #[test]
3733    fn regression_fuzz_difference1() {
3734        let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
3735        let _ = t(pat); // shouldn't panic
3736    }
3737
3738    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
3739    #[test]
3740    fn regression_fuzz_char_decrement1() {
3741        let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
3742        let _ = t(pat); // shouldn't panic
3743    }
3744}