1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
//! UTF-8 和 UTF-16 解码迭代器

use crate::fmt;

use super::from_u32_unchecked;

/// 一个迭代器,用于解码来自 u16 迭代器的 UTF-16 编码的代码点。
#[stable(feature = "decode_utf16", since = "1.9.0")]
#[derive(Clone, Debug)]
pub struct DecodeUtf16<I>
where
    I: Iterator<Item = u16>,
{
    iter: I,
    buf: Option<u16>,
}

/// 解码 UTF-16 代码点时可以返回的错误。
#[stable(feature = "decode_utf16", since = "1.9.0")]
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct DecodeUtf16Error {
    code: u16,
}

/// 在 `iter` 中的 UTF-16 编码的代码点上创建一个迭代器,将不成对的代理返回为 `Err`s。
///
///
/// # Examples
///
/// 基本用法:
///
/// ```
/// use std::char::decode_utf16;
///
/// // 𝄞mus<invalid>ic<invalid>
/// let v = [
///     0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834,
/// ];
///
/// assert_eq!(
///     decode_utf16(v.iter().cloned())
///         .map(|r| r.map_err(|e| e.unpaired_surrogate()))
///         .collect::<Vec<_>>(),
///     vec![
///         Ok('𝄞'),
///         Ok('m'), Ok('u'), Ok('s'),
///         Err(0xDD1E),
///         Ok('i'), Ok('c'),
///         Err(0xD834)
///     ]
/// );
/// ```
///
/// 通过用替换字符替换 `Err` 结果,可以获得有损解码器:
///
/// ```
/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
///
/// // 𝄞mus<invalid>ic<invalid>
/// let v = [
///     0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834,
/// ];
///
/// assert_eq!(
///     decode_utf16(v.iter().cloned())
///        .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
///        .collect::<String>(),
///     "𝄞mus�ic�"
/// );
/// ```
#[stable(feature = "decode_utf16", since = "1.9.0")]
#[inline]
pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> {
    DecodeUtf16 { iter: iter.into_iter(), buf: None }
}

#[stable(feature = "decode_utf16", since = "1.9.0")]
impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
    type Item = Result<char, DecodeUtf16Error>;

    fn next(&mut self) -> Option<Result<char, DecodeUtf16Error>> {
        let u = match self.buf.take() {
            Some(buf) => buf,
            None => self.iter.next()?,
        };

        if u < 0xD800 || 0xDFFF < u {
            // SAFETY: 不是代理
            Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
        } else if u >= 0xDC00 {
            // 尾随代理
            Some(Err(DecodeUtf16Error { code: u }))
        } else {
            let u2 = match self.iter.next() {
                Some(u2) => u2,
                // eof
                None => return Some(Err(DecodeUtf16Error { code: u })),
            };
            if u2 < 0xDC00 || u2 > 0xDFFF {
                // 不是尾随的代理,因此我们不是有效的代理对,因此请回绕以在下一次重新编码 u2。
                //
                self.buf = Some(u2);
                return Some(Err(DecodeUtf16Error { code: u }));
            }

            // 一切正常,让我们对其进行解码。
            let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
            // SAFETY: 我们检查了这是合法的 unicode 值
            Some(Ok(unsafe { from_u32_unchecked(c) }))
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.iter.size_hint();
        // 我们可以是完全有效的代理 (每个字符 2 个元素),也可以是完全非代理 (每个字符 1 个元素)
        //
        (low / 2, high)
    }
}

impl DecodeUtf16Error {
    /// 返回导致此错误的未配对代理。
    #[stable(feature = "decode_utf16", since = "1.9.0")]
    pub fn unpaired_surrogate(&self) -> u16 {
        self.code
    }
}

#[stable(feature = "decode_utf16", since = "1.9.0")]
impl fmt::Display for DecodeUtf16Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "unpaired surrogate found: {:x}", self.code)
    }
}