arg_router  1.4.0
C++ command line argument parsing and routing
utf8.hpp
1 // Copyright (C) 2022 by Camden Mannett.
2 // Distributed under the Boost Software License, Version 1.0.
3 // (See accompanying file LICENSE or copy at https://www.boost.org/LICENSE_1_0.txt)
4 
5 #pragma once
6 
7 #include "arg_router/utility/utf8/double_width.hpp"
8 #include "arg_router/utility/utf8/grapheme_cluster_break.hpp"
9 #include "arg_router/utility/utf8/line_break.hpp"
10 #include "arg_router/utility/utf8/whitespace.hpp"
11 #include "arg_router/utility/utf8/zero_width.hpp"
12 
13 #include <string_view>
14 
17 {
18 namespace detail
19 {
20 template <std::size_t N>
21 [[nodiscard]] constexpr std::optional<code_point::range> find_range(
22  const std::array<code_point::range, N>& table,
23  code_point::type cp) noexcept
24 {
25  auto first = table.begin();
26 
27  auto len = N;
28  while (len > 0) {
29  const auto step = len / 2;
30  auto halfway = first + step;
31 
32  if (cp < halfway->first()) {
33  len = step;
34  } else if (cp <= halfway->last()) {
35  return *halfway;
36  } else {
37  first = ++halfway;
38  len -= step + 1;
39  }
40  }
41 
42  return {};
43 }
44 } // namespace detail
45 
48 class iterator
49 {
50 public:
52  using difference_type = std::string_view::difference_type;
54  using value_type = std::string_view;
56  using pointer = const value_type*;
58  using reference = const value_type&;
60  using iterator_category = std::forward_iterator_tag;
61 
70  class range_t
71  {
72  public:
73  friend class iterator;
74 
79  [[nodiscard]] constexpr iterator begin() noexcept { return iterator{str_}; }
80 
85  [[nodiscard]] constexpr static iterator end() noexcept { return iterator{}; }
86 
87  private:
88  constexpr explicit range_t(std::string_view str) noexcept : str_{str} {}
89 
90  std::string_view str_;
91  };
92 
98  [[nodiscard]] constexpr static range_t range(std::string_view str) noexcept
99  {
100  return range_t{str};
101  }
102 
107  constexpr iterator() noexcept : current_{0}, trailing_window_{} {}
108 
114  constexpr explicit iterator(std::string_view str) noexcept :
115  current_{0}, str_{str}, trailing_window_{}
116  {
117  fill_trailing_window();
118  update_current();
119  }
120 
126  [[nodiscard]] constexpr bool operator==(iterator other) const noexcept
127  {
128  // If they are both empty, then they are considered both end iterators and therefore equal
129  if (str_.empty() && other.str_.empty()) {
130  return true;
131  }
132  return (str_.data() == other.str_.data()) && (str_.size() == other.str_.size());
133  }
134 
140  [[nodiscard]] constexpr bool operator!=(iterator other) const noexcept
141  {
142  return !(*this == other);
143  }
144 
149  [[nodiscard]] constexpr value_type operator*() const noexcept
150  {
151  return str_.substr(0, current_);
152  }
153 
158  constexpr iterator& operator++() noexcept
159  {
160  // Remove the leading cluster
161  str_.remove_prefix(current_);
162 
163  // Find the end of the new one
164  update_current();
165 
166  return *this;
167  }
168 
173  constexpr iterator operator++(int) noexcept
174  {
175  auto result = *this;
176 
177  ++(*this);
178  return result;
179  }
180 
181 private:
182  constexpr static auto trailing_window_size = std::size_t{AR_UTF8_TRAILING_WINDOW_SIZE};
183 
184  [[nodiscard]] constexpr static grapheme_cluster_break_class extract_class(
185  code_point::type cp) noexcept
186  {
187  auto result = grapheme_cluster_break_class::any;
188  const auto range = detail::find_range(grapheme_cluster_break_table, cp);
189  if (range) {
190  result = static_cast<grapheme_cluster_break_class>(range->meta());
191  }
192 
193  return result;
194  }
195 
196  [[nodiscard]] constexpr bool should_break(grapheme_cluster_break_class next_class) noexcept
197  {
198  for (auto rule : no_break_rules::grapheme_cluster<trailing_window_size>) {
199  if (rule(trailing_window_, next_class)) {
200  return false;
201  }
202  }
203 
204  return true;
205  }
206 
207  constexpr void fill_trailing_window() noexcept
208  {
209  // array::fill(..) is only constexpr in C++20
210  for (auto& p : trailing_window_) {
211  p = grapheme_cluster_break_class::any;
212  }
213  }
214 
215  constexpr void rotate_trailing_window() noexcept
216  {
217  // Right rotate the window, std::rotate isn't constexpr in C++17
218  auto it = trailing_window_.rbegin();
219  while (true) {
220  auto prev = it++;
221  if (it == trailing_window_.rend()) {
222  break;
223  }
224  *prev = *it;
225  }
226  }
227 
228  constexpr void update_current() noexcept
229  {
230  current_ = 0;
231  if (str_.empty()) {
232  return;
233  }
234 
235  // Iterate over each code point and its neighbour, pass their break classes into the rule
236  // checker
237  for (auto it = code_point::iterator{str_}; it != code_point::iterator{}; ++it) {
238  current_ += code_point::size(*it);
239 
240  // If this code point is malformed, then just skip it
241  const auto this_cp = code_point::decode(*it);
242  if (!this_cp) {
243  continue;
244  }
245 
246  rotate_trailing_window();
247  trailing_window_.front() = extract_class(*this_cp);
248 
249  // If there is a following code point, use it to update the next_break_class
250  auto next_break_class = grapheme_cluster_break_class::any;
251  {
252  auto next_it = it;
253  ++next_it;
254  if (next_it != code_point::iterator{}) {
255  const auto next_cp = code_point::decode(*next_it);
256  if (next_cp) {
257  next_break_class = extract_class(*next_cp);
258  }
259  }
260  }
261 
262  if (should_break(next_break_class)) {
263  break;
264  }
265  }
266  }
267 
268  std::size_t current_;
269  std::string_view str_;
270  std::array<grapheme_cluster_break_class, trailing_window_size> trailing_window_;
271 };
272 
278 [[nodiscard]] inline constexpr std::size_t count(std::string_view str) noexcept
279 {
280  return std::distance(iterator(str), iterator());
281 }
282 
289 [[nodiscard]] inline constexpr bool is_whitespace(std::string_view str) noexcept
290 {
291  const auto cp = code_point::decode(str);
292  if (!cp) {
293  return false;
294  }
295 
296  return !!detail::find_range(whitespace_table, *cp);
297 }
298 
304 [[nodiscard]] inline constexpr bool contains_whitespace(std::string_view str) noexcept
305 {
306  for (auto cp : code_point::iterator::range(str)) {
307  if (is_whitespace(cp)) {
308  return true;
309  }
310  }
311 
312  return false;
313 }
314 
321 [[nodiscard]] inline constexpr std::size_t terminal_width(std::string_view str) noexcept
322 {
323  auto width = std::size_t{0};
324  for (auto cp_str : code_point::iterator::range(str)) {
325  const auto cp = code_point::decode(cp_str);
326  if (!cp) {
327  continue;
328  }
329 
330  if (detail::find_range(double_width_table, *cp)) {
331  width += 2;
332  } else if (!detail::find_range(zero_width_table, *cp)) {
333  width += 1;
334  }
335  }
336 
337  return width;
338 }
339 
344 {
345 public:
347  using difference_type = std::string_view::difference_type;
349  using value_type = std::string_view;
351  using pointer = const value_type*;
353  using reference = const value_type&;
355  using iterator_category = std::forward_iterator_tag;
356 
361  constexpr line_iterator() noexcept : max_columns_{0}, line_break_byte_{0}, trailing_window_{} {}
362 
370  constexpr explicit line_iterator(std::string_view str, std::size_t max_columns) noexcept :
371  str_{str}, max_columns_{max_columns}, line_break_byte_{0}, trailing_window_{}
372  {
373  if (max_columns_ == 0) {
374  str_ = std::string_view{};
375  }
376 
377  fill_trailing_window();
378  consume();
379  }
380 
385  [[nodiscard]] constexpr std::size_t max_columns() const noexcept { return max_columns_; }
386 
392  [[nodiscard]] constexpr bool operator==(line_iterator other) const noexcept
393  {
394  // If they are both empty, then they are considered both end iterators and therefore equal
395  if (str_.empty() && other.str_.empty()) {
396  return true;
397  }
398  return (str_.data() == other.str_.data()) && (str_.size() == other.str_.size()) &&
399  (max_columns_ == other.max_columns_) && (line_break_byte_ == other.line_break_byte_);
400  }
401 
407  [[nodiscard]] constexpr bool operator!=(line_iterator other) const noexcept
408  {
409  return !(*this == other);
410  }
411 
416  [[nodiscard]] constexpr value_type operator*() const noexcept
417  {
418  return str_.substr(0, line_break_byte_);
419  }
420 
421  constexpr line_iterator& operator++() noexcept
422  {
423  consume();
424  return *this;
425  }
426 
431  constexpr line_iterator operator++(int) noexcept
432  {
433  auto result = *this;
434 
435  ++(*this);
436  return result;
437  }
438 
439 private:
440  constexpr static auto trailing_window_size = std::size_t{AR_UTF8_TRAILING_WINDOW_SIZE};
441 
442  [[nodiscard]] constexpr static line_break_class extract_class(code_point::type cp) noexcept
443  {
444  auto result = line_break_class::any;
445  const auto range = detail::find_range(line_break_table, cp);
446  if (range) {
447  result = static_cast<line_break_class>(range->meta());
448  }
449 
450  return result;
451  }
452 
453  [[nodiscard]] constexpr bool should_break(line_break_class next_class) noexcept
454  {
455  for (auto rule : no_break_rules::line_break<trailing_window_size>) {
456  if (rule(trailing_window_, next_class)) {
457  return false;
458  }
459  }
460 
461  return true;
462  }
463 
464  constexpr void fill_trailing_window() noexcept
465  {
466  // array::fill(..) is only constexpr in C++20
467  for (auto& p : trailing_window_) {
468  p = line_break_class::any;
469  }
470  }
471 
472  constexpr void rotate_trailing_window() noexcept
473  {
474  // Right rotate the window, std::rotate isn't constexpr in C++17
475  auto it = trailing_window_.rbegin();
476  while (true) {
477  auto prev = it++;
478  if (it == trailing_window_.rend()) {
479  break;
480  }
481  *prev = *it;
482  }
483  }
484 
485  constexpr void consume() noexcept
486  {
487  if (str_.empty()) {
488  return;
489  }
490 
491  // Start by consuming the previous line
492  if (line_break_byte_ != 0) {
493  str_.remove_prefix(line_break_byte_);
494  line_break_byte_ = 0;
495 
496  // Consuming any leading whitespace
497  auto bytes = std::size_t{0};
498  for (auto cp : code_point::iterator::range(str_)) {
499  if (!is_whitespace(cp)) {
500  break;
501  }
502  bytes += code_point::size(cp);
503  }
504  str_.remove_prefix(bytes);
505 
506  if (str_.empty()) {
507  return;
508  }
509  }
510 
511  // Iterate over the code points until you reach or exceed the column limit, setting the
512  // break markers at each whitespace code point
513  auto column = std::size_t{0};
514  auto bytes = std::size_t{0};
515  auto line_break_column = std::size_t{0};
516  auto line_break_byte = std::size_t{0};
517  for (auto it = code_point::iterator{str_}; it != code_point::iterator{}; ++it) {
518  // Have we exceeded the terminal width?
519  column += terminal_width(*it);
520  if (column > max_columns_) {
521  if (line_break_column == 0) {
522  // Oh dear, the line has no whitespace in it to break on, so force break on the
523  // last code point
524  line_break_byte_ = bytes;
525  } else {
526  line_break_byte_ = line_break_byte;
527  }
528  return;
529  }
530 
531  bytes += code_point::size(*it);
532 
533  // If this code point is malformed, then just skip it
534  const auto this_cp = code_point::decode(*it);
535  if (!this_cp) {
536  continue;
537  }
538 
539  rotate_trailing_window();
540  trailing_window_.front() = extract_class(*this_cp);
541 
542  // If there is a following code point, use it to update the next_break_class
543  auto next_break_class = line_break_class::any;
544  {
545  auto next_it = it;
546  ++next_it;
547  if (next_it != code_point::iterator{}) {
548  const auto next_cp = code_point::decode(*next_it);
549  if (next_cp) {
550  next_break_class = extract_class(*next_cp);
551  }
552  }
553  }
554 
555  if (should_break(next_break_class)) {
556  line_break_column = column;
557  line_break_byte = bytes;
558  }
559  }
560 
561  // We haven't hit the max column, so consume the whole string
562  line_break_byte_ = str_.size();
563  }
564 
565  std::string_view str_;
566  std::size_t max_columns_;
567  std::size_t line_break_byte_;
568  std::array<line_break_class, trailing_window_size> trailing_window_;
569 };
570 } // namespace arg_router::utility::utf8
constexpr static range_t range(std::string_view str) noexcept
Definition: code_point.hpp:270
constexpr static iterator end() noexcept
Definition: utf8.hpp:85
constexpr iterator begin() noexcept
Definition: utf8.hpp:79
std::string_view::difference_type difference_type
Definition: utf8.hpp:52
const value_type & reference
Definition: utf8.hpp:58
constexpr static range_t range(std::string_view str) noexcept
Definition: utf8.hpp:98
constexpr iterator(std::string_view str) noexcept
Definition: utf8.hpp:114
std::forward_iterator_tag iterator_category
Definition: utf8.hpp:60
constexpr iterator operator++(int) noexcept
Definition: utf8.hpp:173
std::string_view value_type
Definition: utf8.hpp:54
constexpr bool operator!=(iterator other) const noexcept
Definition: utf8.hpp:140
constexpr iterator & operator++() noexcept
Definition: utf8.hpp:158
constexpr bool operator==(iterator other) const noexcept
Definition: utf8.hpp:126
constexpr iterator() noexcept
Definition: utf8.hpp:107
const value_type * pointer
Definition: utf8.hpp:56
constexpr value_type operator*() const noexcept
Definition: utf8.hpp:149
constexpr value_type operator*() const noexcept
Definition: utf8.hpp:416
constexpr line_iterator(std::string_view str, std::size_t max_columns) noexcept
Definition: utf8.hpp:370
constexpr std::size_t max_columns() const noexcept
Definition: utf8.hpp:385
std::string_view::difference_type difference_type
Definition: utf8.hpp:347
constexpr bool operator==(line_iterator other) const noexcept
Definition: utf8.hpp:392
constexpr line_iterator operator++(int) noexcept
Definition: utf8.hpp:431
std::forward_iterator_tag iterator_category
Definition: utf8.hpp:355
constexpr line_iterator() noexcept
Definition: utf8.hpp:361
constexpr bool operator!=(line_iterator other) const noexcept
Definition: utf8.hpp:407
constexpr std::optional< type > decode(std::string_view str) noexcept
Definition: code_point.hpp:184
constexpr std::size_t size(std::string_view str) noexcept
Definition: code_point.hpp:148
constexpr std::size_t terminal_width(std::string_view str) noexcept
Definition: utf8.hpp:321
constexpr bool contains_whitespace(std::string_view str) noexcept
Definition: utf8.hpp:304
constexpr auto line_break_table
Definition: line_break.hpp:700
constexpr auto zero_width_table
Definition: zero_width.hpp:18
constexpr auto whitespace_table
Definition: whitespace.hpp:18
constexpr bool is_whitespace(std::string_view str) noexcept
Definition: utf8.hpp:289
constexpr std::size_t count(std::string_view str) noexcept
Definition: utf8.hpp:278
constexpr auto double_width_table