#
#   Copyright (C) 2002-2009, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  char.txt
#
#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
#      See Unicode Standard Annex #29.
#      These rules are based on TR29 Revision 13, for Unicode Version 5.1
#

#
#  Character Class Definitions.
#
#  Each class below is the set of code points carrying the named
#  Grapheme_Cluster_Break property value.
#
$CR          = [\p{Grapheme_Cluster_Break = CR}];
$LF          = [\p{Grapheme_Cluster_Break = LF}];
$Control     = [\p{Grapheme_Cluster_Break = Control}];
$Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
$Extend      = [\p{Grapheme_Cluster_Break = Extend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
#  Indic Script Definitions.
#
#  For each script: a set of letter code points (given as explicit ranges)
#  and the script's virama sign.  These feed the letter/virama sequence
#  rules in the forward and reverse sections.
#

# Bengali
$BengaliLetter        = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1];
$BengaliSignVirama    = \u09CD;

# Gujarati
$GujaratiLetter       = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1];
$GujaratiSignVirama   = \u0ACD;

# Devanagari
$DevanagariLetter     = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F];
$DevanagariSignVirama = \u094D;

# Kannada
$KannadaLetter        = [\u0C85-\u0CB9 \u0CDE-\u0CE1];
$KannadaSignVirama    = \u0CCD;

# Malayalam
$MalayalamLetter      = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F];
$MalayalamSignVirama  = \u0D4D;

# Odia
$OdiaLetter           = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71];
$OdiaSignVirama       = \u0B4D;

# Gurmukhi
$GurmukhiLetter       = [\u0A05-\u0A39 \u0A59-\u0A5E];
$GurmukhiSignVirama   = \u0A4D;

# Tamil: only the two letters used by the single Tamil rule are defined,
# not a general letter set.
$TamilKa              = \u0B95;
$TamilSignVirama      = \u0BCD;
$TamilSsa             = \u0BB7;

# Telugu
$TeluguLetter         = [\u0C05-\u0C39 \u0C58-\u0C61];
$TeluguSignVirama     = \u0C4D;

#
# Korean Syllable Definitions
#
$L   = [\p{Grapheme_Cluster_Break = L}];
$V   = [\p{Grapheme_Cluster_Break = V}];
$T   = [\p{Grapheme_Cluster_Break = T}];
$LV  = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];


## -------------------------------------------------
##  Options
##
##  !!chain turns on rule chaining (see the ICU break rule documentation).

!!chain;


## -------------------------------------------------
##  Forward rules.  Each rule lists a sequence that is matched as a unit.

!!forward;

# CR followed by LF.
$CR $LF;

# Indic sequences: a letter followed by one or more groups of
# (virama, optional letter) of the same script.
$BengaliLetter    ($BengaliSignVirama    $BengaliLetter?)+;
$GujaratiLetter   ($GujaratiSignVirama   $GujaratiLetter?)+;
$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+;
$KannadaLetter    ($KannadaSignVirama    $KannadaLetter?)+;
$MalayalamLetter  ($MalayalamSignVirama  $MalayalamLetter?)+;
$OdiaLetter       ($OdiaSignVirama       $OdiaLetter?)+;
$GurmukhiLetter   ($GurmukhiSignVirama   $GurmukhiLetter?)+;

# Tamil: exactly KA, virama, SSA.
$TamilKa $TamilSignVirama $TamilSsa;

$TeluguLetter     ($TeluguSignVirama     $TeluguLetter?)+;

# Korean syllable sequences built from the L / V / T / LV / LVT classes.
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;

# Anything other than Control, CR or LF, followed by an Extend character.
[^$Control $CR $LF] $Extend;

# Anything other than Control, CR or LF, followed by a SpacingMark.
[^$Control $CR $LF] $SpacingMark;

# A Prepend character followed by anything other than Control, CR or LF.
$Prepend [^$Control $CR $LF];


## -------------------------------------------------
##  Reverse rules.  Each one is the corresponding forward rule with its
##  elements written in the opposite order.

!!reverse;

$LF $CR;

($BengaliLetter?    $BengaliSignVirama)+    $BengaliLetter;
($GujaratiLetter?   $GujaratiSignVirama)+   $GujaratiLetter;
($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter;
($KannadaLetter?    $KannadaSignVirama)+    $KannadaLetter;
($MalayalamLetter?  $MalayalamSignVirama)+  $MalayalamLetter;
($OdiaLetter?       $OdiaSignVirama)+       $OdiaLetter;
($GurmukhiLetter?   $GurmukhiSignVirama)+   $GurmukhiLetter;

$TamilSsa $TamilSignVirama $TamilKa;

($TeluguLetter?     $TeluguSignVirama)+     $TeluguLetter;

($L | $V | $LV | $LVT) $L;
($V | $T) ($LV | $V);
$T ($LVT | $T);

$Extend      [^$Control $CR $LF];
$SpacingMark [^$Control $CR $LF];
[^$Control $CR $LF] $Prepend;


## -------------------------------------------------
##  Safe reverse rules: the section is declared but contains no rules.

!!safe_reverse;


## -------------------------------------------------
##  Safe forward rules: the section is declared but contains no rules.

!!safe_forward;