@@ -577,57 +577,77 @@ string_fragment::byte_to_column_index(const size_t byte_index) const
577577 return curr_col;
578578}
579579
580- static bool
581- iswordbreak (wchar_t wchar)
/**
 * Coarse character categories used when looking for word boundaries.
 */
enum class word_char_class {
    /// Whitespace; tabs are also treated as this class by the word scanners.
    space,
    /// Letters, numbers, and connector punctuation (e.g. underscore).
    word,
    /// Anything that is neither whitespace nor a word character.
    symbol,
};
585+
586+ static word_char_class
587+ classify_word_char (wchar_t wchar)
582588{
583- static constexpr uint32_t mask
589+ if (uc_is_property_white_space (wchar)) {
590+ return word_char_class::space;
591+ }
592+ static constexpr uint32_t word_mask
584593 = UC_CATEGORY_MASK_L | UC_CATEGORY_MASK_N | UC_CATEGORY_MASK_Pc;
585- return !uc_is_general_category_withtable (wchar, mask);
594+ if (uc_is_general_category_withtable (wchar, word_mask)) {
595+ return word_char_class::word;
596+ }
597+ return word_char_class::symbol;
598+ }
599+
600+ static bool
601+ is_word_start (word_char_class curr_class, word_char_class prev_class)
602+ {
603+ if (curr_class == word_char_class::word
604+ && prev_class != word_char_class::word)
605+ {
606+ return true ;
607+ }
608+ if (curr_class == word_char_class::symbol
609+ && prev_class == word_char_class::space)
610+ {
611+ return true ;
612+ }
613+ return false ;
586614}
587615
588616std::optional<int >
589617string_fragment::next_word (const int start_col) const
590618{
591619 auto index = this ->sf_begin ;
592620 int curr_col = 0 ;
593- auto in_word = false ;
621+ auto prev_class = word_char_class::space ;
594622
595623 while (index < this ->sf_end ) {
596624 auto read_res = ww898::utf::utf8::read (
597625 [this , &index]() { return this ->sf_string [index++]; });
598626 if (read_res.isErr ()) {
599627 curr_col += 1 ;
600- } else {
601- auto ch = read_res.unwrap ();
628+ continue ;
629+ }
630+ auto ch = read_res.unwrap ();
631+
632+ if (ch == ' \t ' ) {
633+ prev_class = word_char_class::space;
634+ do {
635+ curr_col += 1 ;
636+ } while (curr_col % 8 );
637+ continue ;
638+ }
602639
603- switch (ch) {
604- case ' \t ' :
605- do {
606- curr_col += 1 ;
607- } while (curr_col % 8 );
608- break ;
609- default : {
610- auto wcw_res = uc_width (read_res.unwrap (), " UTF-8" );
611- if (wcw_res < 0 ) {
612- wcw_res = 1 ;
613- }
640+ auto wcw_res = uc_width (ch, " UTF-8" );
641+ if (wcw_res < 0 ) {
642+ wcw_res = 1 ;
643+ }
614644
615- if (curr_col == start_col) {
616- in_word = !iswordbreak (ch);
617- } else if (curr_col > start_col) {
618- if (in_word) {
619- if (iswordbreak (ch)) {
620- in_word = false ;
621- }
622- } else if (!iswordbreak (ch)) {
623- return curr_col;
624- }
625- }
626- curr_col += wcw_res;
627- break ;
628- }
629- }
645+ auto curr_class = classify_word_char (ch);
646+ if (curr_col > start_col && is_word_start (curr_class, prev_class)) {
647+ return curr_col;
630648 }
649+ prev_class = curr_class;
650+ curr_col += wcw_res;
631651 }
632652
633653 return std::nullopt ;
@@ -638,50 +658,130 @@ string_fragment::prev_word(const int start_col) const
638658{
639659 auto index = this ->sf_begin ;
640660 int curr_col = 0 ;
641- auto in_word = false ;
661+ auto prev_class = word_char_class::space ;
642662 std::optional<int > last_word_col;
643663
644664 while (index < this ->sf_end ) {
645665 auto read_res = ww898::utf::utf8::read (
646666 [this , &index]() { return this ->sf_string [index++]; });
647667 if (read_res.isErr ()) {
648668 curr_col += 1 ;
649- } else {
650- auto ch = read_res.unwrap ();
651-
652- switch (ch) {
653- case ' \t ' :
654- do {
655- curr_col += 1 ;
656- } while (curr_col % 8 );
657- break ;
658- default : {
659- auto wcw_res = uc_width (read_res.unwrap (), " UTF-8" );
660- if (wcw_res < 0 ) {
661- wcw_res = 1 ;
662- }
669+ continue ;
670+ }
671+ auto ch = read_res.unwrap ();
663672
664- if (curr_col == start_col) {
665- return last_word_col;
666- }
667- if (iswordbreak (ch)) {
668- in_word = false ;
669- } else {
670- if (!in_word) {
671- last_word_col = curr_col;
672- }
673- in_word = true ;
674- }
675- curr_col += wcw_res;
676- break ;
677- }
673+ if (ch == ' \t ' ) {
674+ if (curr_col >= start_col) {
675+ return last_word_col;
678676 }
677+ prev_class = word_char_class::space;
678+ do {
679+ curr_col += 1 ;
680+ } while (curr_col % 8 );
681+ continue ;
682+ }
683+
684+ if (curr_col >= start_col) {
685+ return last_word_col;
686+ }
687+
688+ auto wcw_res = uc_width (ch, " UTF-8" );
689+ if (wcw_res < 0 ) {
690+ wcw_res = 1 ;
679691 }
692+
693+ auto curr_class = classify_word_char (ch);
694+ if (is_word_start (curr_class, prev_class)) {
695+ last_word_col = curr_col;
696+ }
697+ prev_class = curr_class;
698+ curr_col += wcw_res;
680699 }
681700
682701 return last_word_col;
683702}
684703
704+ std::optional<int >
705+ string_fragment::curr_word (const int start_col) const
706+ {
707+ auto index = this ->sf_begin ;
708+ int curr_col = 0 ;
709+ auto prev_class = word_char_class::space;
710+ std::optional<int > last_word_col;
711+
712+ while (index < this ->sf_end ) {
713+ auto read_res = ww898::utf::utf8::read (
714+ [this , &index]() { return this ->sf_string [index++]; });
715+ if (read_res.isErr ()) {
716+ curr_col += 1 ;
717+ continue ;
718+ }
719+ auto ch = read_res.unwrap ();
720+
721+ if (ch == ' \t ' ) {
722+ if (curr_col >= start_col) {
723+ return std::nullopt ;
724+ }
725+ prev_class = word_char_class::space;
726+ do {
727+ curr_col += 1 ;
728+ } while (curr_col % 8 );
729+ continue ;
730+ }
731+
732+ auto wcw_res = uc_width (ch, " UTF-8" );
733+ if (wcw_res < 0 ) {
734+ wcw_res = 1 ;
735+ }
736+
737+ auto curr_class = classify_word_char (ch);
738+
739+ if (start_col < curr_col + wcw_res) {
740+ if (curr_class == word_char_class::space) {
741+ return std::nullopt ;
742+ }
743+ if (is_word_start (curr_class, prev_class)) {
744+ return curr_col;
745+ }
746+ return last_word_col;
747+ }
748+
749+ if (is_word_start (curr_class, prev_class)) {
750+ last_word_col = curr_col;
751+ }
752+ prev_class = curr_class;
753+ curr_col += wcw_res;
754+ }
755+
756+ return std::nullopt ;
757+ }
758+
759+ std::string
760+ string_fragment::transform_codepoints (
761+ const std::function<uint32_t (uint32_t )>& xform) const
762+ {
763+ std::string out;
764+ out.reserve (this ->length ());
765+
766+ auto index = this ->sf_begin ;
767+ while (index < this ->sf_end ) {
768+ auto byte_before = index;
769+ auto read_res = ww898::utf::utf8::read (
770+ [this , &index]() { return this ->sf_string [index++]; });
771+ if (read_res.isErr ()) {
772+ for (auto j = byte_before; j < index; ++j) {
773+ out.push_back (this ->sf_string [j]);
774+ }
775+ continue ;
776+ }
777+ auto cp = read_res.unwrap ();
778+ auto new_cp = xform (cp);
779+ ww898::utf::utf8::write (
780+ new_cp, [&out](const char b) { out.push_back (b); });
781+ }
782+ return out;
783+ }
784+
685785size_t
686786string_fragment::column_width () const
687787{
0 commit comments