# Segmentation rules for LineBreak
#
# Character Classes
#
#
# Variables
#
# Raw Line_Break property classes, one variable per property value.
# Many of these variables are redefined further down in this file
# (merged into other classes, or extended to absorb combining marks).
$AI=\p{Line_Break=Ambiguous}
$AL=\p{Line_Break=Alphabetic}
$B2=\p{Line_Break=Break_Both}
$BA=\p{Line_Break=Break_After}
$BB=\p{Line_Break=Break_Before}
$BK=\p{Line_Break=Mandatory_Break}
$CB=\p{Line_Break=Contingent_Break}
$CL=\p{Line_Break=Close_Punctuation}
$CP=\p{Line_Break=CP}
# Named $CM1 because $CM itself is built later as the union of $CM1 and $ZWJ.
$CM1=\p{Line_Break=Combining_Mark}
$CR=\p{Line_Break=Carriage_Return}
$EX=\p{Line_Break=Exclamation}
$GL=\p{Line_Break=Glue}
$H2=\p{Line_Break=H2}
$H3=\p{Line_Break=H3}
$HL=\p{Line_Break=HL}
$HY=\p{Line_Break=Hyphen}
$ID=\p{Line_Break=Ideographic}
$IN=\p{Line_Break=Inseparable}
$IS=\p{Line_Break=Infix_Numeric}
$JL=\p{Line_Break=JL}
$JT=\p{Line_Break=JT}
$JV=\p{Line_Break=JV}
$LF=\p{Line_Break=Line_Feed}
$NL=\p{Line_Break=Next_Line}
$NS=\p{Line_Break=Nonstarter}
$NU=\p{Line_Break=Numeric}
$OP=\p{Line_Break=Open_Punctuation}
$PO=\p{Line_Break=Postfix_Numeric}
$PR=\p{Line_Break=Prefix_Numeric}
$QU=\p{Line_Break=Quotation}
$SA=\p{Line_Break=Complex_Context}
$SG=\p{Line_Break=Surrogate}
$SP=\p{Line_Break=Space}
$SY=\p{Line_Break=Break_Symbols}
$WJ=\p{Line_Break=Word_Joiner}
$XX=\p{Line_Break=Unknown}
$ZW=\p{Line_Break=ZWSpace}
$CJ=\p{Line_Break=Conditional_Japanese_Starter}
$RI=\p{Line_Break=Regional_Indicator}
$EB=\p{Line_Break=E_Base}
$EM=\p{Line_Break=E_Modifier}
# $ZWJ_O keeps the bare ZWJ class for rule 8.1; $ZWJ is later redefined
# as ($ZWJ $X) and so no longer denotes a single ZWJ character.
$ZWJ_O=\p{Line_Break=ZWJ}
$ZWJ=\p{Line_Break=ZWJ}
#
# Macros
#
# $CM is the union of the Combining_Mark and ZWJ classes.
$CM=[$CM1 $ZWJ]
#
# LB 1 Assign a line breaking class to each code point
# of the input.
#
# Resolve AI, CB, SA, SG, and XX into other line
# breaking classes depending on criteria outside the scope
# of this algorithm.
#
# NOTE: CB is ok to fall through, but must handle others
# here.
#
# AI, XX, SA and SG are folded into AL; CJ is folded into NS.
$AL=[$AI $AL $XX $SA $SG]
$NS=[$NS $CJ]
#
# WARNING: Fixes for Rule 9
#
# Treat X (CM | ZWJ)* as if it were X.
#
# Where X is any line break class except SP, BK, CR, LF,
# NL or ZW.
#
# $X is the (possibly empty) run of $CM that gets appended to each
# class in the redefinitions that follow.
$X=$CM*
#
# Macros
#
# NOTE(review): these sets are presumably defined before the redefinitions
# below so that they are built from the raw single-character classes — confirm.
#
# $Spec1_: the classes excluded from Rule 9 (SP, BK, CR, LF, NL, ZW).
$Spec1_=[$SP $BK $CR $LF $NL $ZW]
# $Spec2_: complement of $Spec1_; used as the left context of rule 9.
$Spec2_=[^ $SP $BK $CR $LF $NL $ZW]
# $Spec3a_ / $Spec3b_: left contexts for the $GL rules 12.1 and 12.2.
$Spec3a_=[^ $SP $BA $HY $CM]
$Spec3b_=[^ $BA $HY $CM]
# $Spec4_: left context for rules 13.02 and 13.03 (anything but NU or CM).
$Spec4_=[^ $NU $CM]
# Redefine each class as "the class followed by $X", i.e. followed by any
# number of combining marks / ZWJ. SP, BK, CR, LF, NL and ZW are not
# redefined, matching the exclusions listed for Rule 9.
$AI=($AI $X)
$AL=($AL $X)
$B2=($B2 $X)
$BA=($BA $X)
$BB=($BB $X)
$CB=($CB $X)
$CL=($CL $X)
$CP=($CP $X)
$CM=($CM $X)
$EX=($EX $X)
$GL=($GL $X)
$H2=($H2 $X)
$H3=($H3 $X)
$HL=($HL $X)
$HY=($HY $X)
$ID=($ID $X)
$IN=($IN $X)
$IS=($IS $X)
$JL=($JL $X)
$JT=($JT $X)
$JV=($JV $X)
$NS=($NS $X)
$NU=($NU $X)
$OP=($OP $X)
$PO=($PO $X)
$PR=($PR $X)
$QU=($QU $X)
$SA=($SA $X)
$SG=($SG $X)
$SY=($SY $X)
$WJ=($WJ $X)
$XX=($XX $X)
$RI=($RI $X)
$EB=($EB $X)
$EM=($EM $X)
$ZWJ=($ZWJ $X)
#
# OUT OF ORDER ON PURPOSE
#
# LB 10 Treat any remaining combining mark as AL.
#
# Besides the regular $AL, a $CM also counts as AL when it is at the start
# of the text (^ $CM) or directly follows one of the $Spec1_ classes
# (SP, BK, CR, LF, NL, ZW).
#
$AL=($AL | ^ $CM | (?<=$Spec1_) $CM)
#
# Rules
#
# LB 4 Always break after hard line breaks (but never
# between CR and LF).
#
4) $BK ÷
#
# LB 5 Treat CR followed by LF, as well as CR, LF and
# NL as hard line breaks.
#
5.01) $CR × $LF
5.02) $CR ÷
5.03) $LF ÷
5.04) $NL ÷
#
# LB 6 Do not break before hard line breaks.
#
6) × ( $BK | $CR | $LF | $NL )
#
# LB 7 Do not break before spaces or zero-width space.
#
7.01) × $SP
7.02) × $ZW
#
# LB 8 Break before any character following a
# zero-width space, even if one or more spaces intervene.
#
8) $ZW $SP* ÷
#
# LB 8a Don't break between ZWJ and IDs (for use in
# Emoji ZWJ sequences)
#
8.1) $ZWJ_O × ($ID | $EB | $EM)
#
# LB 9 Do not break a combining character sequence;
# treat it as if it has the LB class of the base character
# in all of the following rules. (Where X is any line
# break class except SP, BK, CR, LF, NL or ZW.)
#
9) $Spec2_ × $CM
#
# WARNING: this is done by modifying the variable values
# for all but SP.... That is, $AL is really ($AL $CM*)!
#
# LB 11 Do not break before or after WORD JOINER and
# related characters.
#
11.01) × $WJ
11.02) $WJ ×
#
# LB 12 Do not break after NBSP and related characters.
#
12) $GL ×
12.1) $Spec3a_ × $GL
12.2) $Spec3b_ $CM+ × $GL
12.3) ^ $CM+ × $GL
#
# LB 13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’,
# even after spaces.
#
# Using customization 7.
#
13.01) × $EX
13.02) $Spec4_ × ($CL | $CP | $IS | $SY)
13.03) $Spec4_ $CM+ × ($CL | $CP | $IS | $SY)
13.04) ^ $CM+ × ($CL | $CP | $IS | $SY)
#
# LB 14 Do not break after ‘[’, even after spaces.
#
14) $OP $SP* ×
#
# LB 15 Do not break within ‘"[’, even with intervening
# spaces.
#
15) $QU $SP* × $OP
#
# LB 16 Do not break between closing punctuation and a
# nonstarter (lb=NS), even with intervening spaces.
#
16) ($CL | $CP) $SP* × $NS
#
# LB 17 Do not break within ‘——’, even with intervening
# spaces.
#
17) $B2 $SP* × $B2
#
# LB 18 Break after spaces.
#
18) $SP ÷
#
# LB 19 Do not break before or after ‘"’.
#
19.01) × $QU
19.02) $QU ×
#
# LB 20 Break before and after unresolved CB.
#
20.01) ÷ $CB
20.02) $CB ÷
#
# LB 21 Do not break before hyphen-minus, other
# hyphens, fixed-width spaces, small kana and other
# non-starters, or after acute accents.
#
21.01) × $BA
21.02) × $HY
21.03) × $NS
21.04) $BB ×
#
# LB 21a Don't break after Hebrew + Hyphen.
#
21.1) $HL ($HY | $BA) ×
#
# LB 21b Don’t break between Solidus and Hebrew letters.
#
21.2) $SY × $HL
#
# LB 22 Do not break between two ellipses, or between
# letters, numbers or exclamations and ellipsis.
#
22.01) ($AL | $HL) × $IN
22.02) $EX × $IN
22.03) ($ID | $EB | $EM) × $IN
22.04) $IN × $IN
22.05) $NU × $IN
#
# LB 23 Do not break between digits and letters.
#
23.02) ($AL | $HL) × $NU
23.03) $NU × ($AL | $HL)
#
# LB 23a Do not break between numeric prefixes and
# ideographs, or between ideographs and numeric postfixes.
#
23.12) $PR × ($ID | $EB | $EM)
23.13) ($ID | $EB | $EM) × $PO
#
# LB 24 Do not break between numeric prefix/postfix and
# letters, or between letters and prefix/postfix.
#
24.02) ($PR | $PO) × ($AL | $HL)
24.03) ($AL | $HL) × ($PR | $PO)
#
# Using customization 7
#
# LB Alternative: ( PR | PO) ? ( OP | HY ) ? NU (NU | SY
# | IS) * (CL | CP) ? ( PR | PO) ?
#
# Insert × every place it could go. However, make sure
# that at least one thing is concrete, otherwise would
# cause $NU to not break before or after
#
25.01) ($PR | $PO) × ( $OP | $HY )? $NU
25.02) ( $OP | $HY ) × $NU
25.03) $NU × ($NU | $SY | $IS)
25.04) $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL | $CP)
25.05) $NU ($NU | $SY | $IS)* ($CL | $CP)? × ($PO | $PR)
#
# LB 26 Do not break a Korean syllable.
#
# Alternations are grouped explicitly so that each side of × reads as a
# single unit, as in the other rules of this file.
#
26.01) $JL × ($JL | $JV | $H2 | $H3)
26.02) ($JV | $H2) × ($JV | $JT)
26.03) ($JT | $H3) × $JT
#
# LB 27 Treat a Korean Syllable Block the same as ID.
#
27.01) ($JL | $JV | $JT | $H2 | $H3) × $IN
27.02) ($JL | $JV | $JT | $H2 | $H3) × $PO
27.03) $PR × ($JL | $JV | $JT | $H2 | $H3)
#
# LB 28 Do not break between alphabetics ("at").
#
28) ($AL | $HL) × ($AL | $HL)
#
# LB 29 Do not break between numeric punctuation and
# alphabetics ("e.g.").
#
29) $IS × ($AL | $HL)
#
# LB 30 Do not break between letters, numbers or
# ordinary symbols and opening or closing punctuation.
#
30.01) ($AL | $HL | $NU) × $OP
30.02) $CP × ($AL | $HL | $NU)
#
# LB 30a Break between two Regional Indicators if and
# only if there is an even number of them before the point
# being considered.
#
# 30.11 / 30.12 keep a pair together when an odd number of RI precede the
# break point (counted from the start of text or from a non-RI character);
# 30.13 breaks between RI otherwise.
#
30.11) ^ ($RI $RI)* $RI × $RI
30.12) [^$RI] ($RI $RI)* $RI × $RI
30.13) $RI ÷ $RI
#
# LB 30b Do not break between an emoji base and an emoji modifier.
#
30.2) $EB × $EM
# Segmentation rules for SentenceBreak
#
# Character Classes
#
# NOTE: $CR and $LF are rebound here to the Sentence_Break property values.
#
$CR=\p{Sentence_Break=CR}
$LF=\p{Sentence_Break=LF}
$Extend=\p{Sentence_Break=Extend}
$Format=\p{Sentence_Break=Format}
$Sep=\p{Sentence_Break=Sep}
$Sp=\p{Sentence_Break=Sp}
$Lower=\p{Sentence_Break=Lower}
$Upper=\p{Sentence_Break=Upper}
$OLetter=\p{Sentence_Break=OLetter}
$Numeric=\p{Sentence_Break=Numeric}
$ATerm=\p{Sentence_Break=ATerm}
$STerm=\p{Sentence_Break=STerm}
$Close=\p{Sentence_Break=Close}
$SContinue=\p{Sentence_Break=SContinue}
$Any=.
#
# Expresses the negation in rule 8; can't do this with normal regex,
# but works with UnicodeSet, which is all we need.
#
# WARNING: For Rule 5, now add format and extend to everything but
# Sep, Format, and Extend
#
$FE=[$Format $Extend]
#
# Special rules
#
$NotPreLower_=[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]
$Sp=($Sp $FE*)
$Lower=($Lower $FE*)
$Upper=($Upper $FE*)
$OLetter=($OLetter $FE*)
$Numeric=($Numeric $FE*)
$ATerm=($ATerm $FE*)
$STerm=($STerm $FE*)
$Close=($Close $FE*)
$SContinue=($SContinue $FE*)
#
# Macros
#
$ParaSep = ($Sep | $CR | $LF)
$SATerm = ($STerm | $ATerm)
#
# Rules
#
# Break at the start and end of text, unless the text is empty.
#
# Do not break within CRLF.
#
3) $CR × $LF
#
# Break after paragraph separators.
#
4) $ParaSep ÷
#
# Ignore Format and Extend characters, except after sot, ParaSep,
# and within CRLF. (See Section 6.2, Replacing Ignore Rules.)
# This also has the effect of: Any × (Format | Extend)
#
# WARNING: Implemented as don't break before format (except
# after linebreaks),
#
# AND add format and extend in all variables definitions that appear
# after this point!
#
5) × [$Format $Extend]
#
# Do not break after full stop in certain contexts. [See note below.]
#
# Do not break after ambiguous terminators like period, if
# immediately followed by a number or lowercase letter,
#
# is between uppercase letters, or if the first following letter
# (optionally after certain punctuation) is lowercase.
#
# For example, a period may be an abbreviation or numeric period, and
# not mark the end of a sentence.
#
6) $ATerm × $Numeric
7) ($Upper | $Lower) $ATerm × $Upper
8) $ATerm $Close* $Sp* × $NotPreLower_* $Lower
8.1) $SATerm $Close* $Sp* × ($SContinue | $SATerm)
#
# Break after sentence terminators, but include closing
# punctuation, trailing spaces, and any paragraph
# separator. [See note below.] Include closing
# punctuation, trailing spaces, and (optionally) a
# paragraph separator.
#
9) $SATerm $Close* × ( $Close | $Sp | $ParaSep )
#
# Note the fix to $Sp*, $Sep?
#
10) $SATerm $Close* $Sp* × ( $Sp | $ParaSep )
11) $SATerm $Close* $Sp* $ParaSep? ÷
#
# Otherwise, do not break
#
12) × $Any