<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body>
    <div class="moz-cite-prefix">On 10/13/20 5:29 PM, Shawn Steele
      wrote:<br>
    </div>
    <blockquote type="cite"
cite="mid:DM6PR00MB06652F948FE5AC12EE2D84FB82041@DM6PR00MB0665.namprd00.prod.outlook.com">
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <meta name="Generator" content="Microsoft Word 15 (filtered
        medium)">
      <style><!--
/* Font Definitions */
@font-face
        {font-family:Wingdings;
        panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:"Yu Gothic";
        panose-1:2 11 4 0 0 0 0 0 0 0;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:"Segoe UI Emoji";
        panose-1:2 11 5 2 4 2 4 2 2 3;}
@font-face
        {font-family:"\@Yu Gothic";
        panose-1:2 11 4 0 0 0 0 0 0 0;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0in;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:blue;
        text-decoration:underline;}
span.EmailStyle20
        {mso-style-type:personal-reply;
        font-family:"Calibri",sans-serif;
        color:windowtext;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-size:10.0pt;}
@page WordSection1
        {size:8.5in 11.0in;
        margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
        {page:WordSection1;}
/* List Definitions */
@list l0
        {mso-list-id:228997977;
        mso-list-template-ids:-448384510;}
@list l0:level1
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l0:level2
        {mso-level-number-format:bullet;
        mso-level-text:o;
        mso-level-tab-stop:1.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:"Courier New";
        mso-bidi-font-family:"Times New Roman";}
@list l0:level3
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:1.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l0:level4
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:2.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l0:level5
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:2.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l0:level6
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:3.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l0:level7
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:3.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l0:level8
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:4.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l0:level9
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:4.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l1
        {mso-list-id:1087655170;
        mso-list-template-ids:892095818;}
@list l1:level1
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level2
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:1.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level3
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:1.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level4
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:2.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level5
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:2.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level6
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:3.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level7
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:3.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level8
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:4.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level9
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:4.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2
        {mso-list-id:1462188070;
        mso-list-template-ids:-2055285018;}
@list l2:level1
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2:level2
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:1.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2:level3
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:1.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2:level4
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:2.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2:level5
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:2.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2:level6
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:3.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2:level7
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:3.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2:level8
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:4.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l2:level9
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:4.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l3
        {mso-list-id:1913008283;
        mso-list-template-ids:-1333123198;}
@list l3:level1
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l3:level2
        {mso-level-number-format:bullet;
        mso-level-text:o;
        mso-level-tab-stop:1.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:"Courier New";
        mso-bidi-font-family:"Times New Roman";}
@list l3:level3
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:1.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l3:level4
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:2.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l3:level5
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:2.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l3:level6
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:3.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l3:level7
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:3.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l3:level8
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:4.0in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
@list l3:level9
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:4.5in;
        mso-level-number-position:left;
        text-indent:-.25in;
        mso-ansi-font-size:10.0pt;
        font-family:Wingdings;}
ol
        {margin-bottom:0in;}
ul
        {margin-bottom:0in;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
      <div class="WordSection1">
        <p class="MsoNormal">>   The "use something other than a BOM"
          could mean adding a command line option, adding a menu option,
          remembering what encoding was used for that file last time,
          performing a heuristic analysis (that may or may not include
          the presence of a BOM in its calculation), prompting the user,
          etc...<o:p></o:p></p>
        <p class="MsoNormal"><o:p> </o:p></p>
        <p class="MsoNormal">That’s the catch.  “Adding… adding…
          remembering… performing.”  If the code was doing the best
          practices/right thing, it’d be using UTF-8.  It isn’t, and
          it’s sort of a given that it’s legacy behavior.  Therefore
          “adding”, etc. means that changes have to happen to the
          applications &/or processes.  Which aren’t necessarily
          going to be deployed promptly, if at all.  This isn’t a
          problem that a standard or best practices can solve. 
          <br>
          <br>
          Everyone already knows the best practice:  “Use UTF-8”.  Any
          resources/effort is going to be getting toward that best
          practice, not edge cases of legacy behaviors that are
          offshoots of something that isn’t the desired end state of
          “use UTF-8”.</p>
      </div>
    </blockquote>
    <p>My goal is exactly to ease migration to that end state.  We can't
      reasonably synchronize a migration of all C++ projects to UTF-8. 
      To get to that end state, we'll have to enable C++ projects to
      independently transition to UTF-8.  Such independent transition
      will be eased by having a portable means to indicate that a source
      file is UTF-8 encoded in such a way that a C++ compiler can
      process it correctly when it is #included from a differently
      encoded source file.  This would suffice for a project to migrate
      to UTF-8 while being usable (e.g., having its header files
      #included) by another UTF-8 encoded project, a Windows-1252
      encoded project, or an EBCDIC encoded project.  Those other
      projects can then migrate to UTF-8 on their own schedule.</p>
    <p>Use of a BOM would be one way to get to that desired end state
      but, as you mentioned, a BOM isn't a great way to identify UTF-8
      data.  The Unicode standard already admits this with the quoted
      "not recommended" text, but it lacks the rationale to defend that
      recommendation or to explain when it may be appropriate to
      disregard that recommendation.  My goal with this paper is to fill
      that hole.  If you don't care for how I've proposed it to be
      filled, that is certainly ok and alternative suggestions are
      welcome.<br>
    </p>
    <blockquote type="cite"
cite="mid:DM6PR00MB06652F948FE5AC12EE2D84FB82041@DM6PR00MB0665.namprd00.prod.outlook.com">
      <div class="WordSection1">
        <p class="MsoNormal"><o:p></o:p></p>
        <p class="MsoNormal"><o:p> </o:p></p>
        <p class="MsoNormal">I sympathize with the problem, since I
          encounter variations of it every day, I just don’t think any
          tweaking of this text will have any practical impact with
          moving the needle.</p>
      </div>
    </blockquote>
    <p>That is entirely possible.</p>
    <p>Tom.<br>
    </p>
    <blockquote type="cite"
cite="mid:DM6PR00MB06652F948FE5AC12EE2D84FB82041@DM6PR00MB0665.namprd00.prod.outlook.com">
      <div class="WordSection1">
        <p class="MsoNormal"><o:p></o:p></p>
        <p class="MsoNormal"><o:p> </o:p></p>
        <p class="MsoNormal">-Shawn<o:p></o:p></p>
        <p class="MsoNormal"><o:p> </o:p></p>
        <div>
          <div style="border:none;border-top:solid #E1E1E1
            1.0pt;padding:3.0pt 0in 0in 0in">
            <p class="MsoNormal"><b>From:</b> Tom Honermann
              <a class="moz-txt-link-rfc2396E" href="mailto:tom@honermann.net">&lt;tom@honermann.net&gt;</a> <br>
              <b>Sent:</b> Tuesday, October 13, 2020 2:06 PM<br>
              <b>To:</b> Shawn Steele
              <a class="moz-txt-link-rfc2396E" href="mailto:Shawn.Steele@microsoft.com">&lt;Shawn.Steele@microsoft.com&gt;</a>; Alisdair Meredith
              <a class="moz-txt-link-rfc2396E" href="mailto:alisdairm@me.com">&lt;alisdairm@me.com&gt;</a><br>
              <b>Cc:</b> <a class="moz-txt-link-abbreviated" href="mailto:sg16@lists.isocpp.org">sg16@lists.isocpp.org</a>; Unicode Mail List
              <a class="moz-txt-link-rfc2396E" href="mailto:unicode@unicode.org">&lt;unicode@unicode.org&gt;</a><br>
              <b>Subject:</b> Re: [SG16] Draft proposal: Clarify
              guidance for use of a BOM as a UTF-8 encoding signature<o:p></o:p></p>
          </div>
        </div>
        <p class="MsoNormal"><o:p> </o:p></p>
        <div>
          <p class="MsoNormal">On 10/13/20 4:42 PM, Shawn Steele wrote:<o:p></o:p></p>
        </div>
        <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
          <p class="MsoNormal">My assertion is that if the application
            cannot change to UTF-8 due to legacy considerations, that
            the subtleties of whether to use a BOM or not also cannot be
            prescribed.  If the application could follow best practices,
            it would use UTF-8.  Since it cannot use UTF-8, therefore it
            can’t follow any prescribed behavior.  Therefore anything
            beyond “Use Unicode!” is merely suggestions.  Terminology
            like “require” implies a false sense of rigor that these
            applications can’t follow in practice.<o:p></o:p></p>
        </blockquote>
        <p>This is why the prescription remains abstract:<o:p></o:p></p>
        <ul type="disc">
          <li class="MsoNormal"
            style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l0
            level1 lfo1">
            If possible, use something other than a BOM.<o:p></o:p></li>
          <li class="MsoNormal"
            style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l0
            level1 lfo1">
            As a last resort, use a BOM.<o:p></o:p></li>
        </ul>
        <p>I am effectively proposing that as a best practice.<o:p></o:p></p>
        <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">Eg:  Presume I have a text editor that
            has been used in some context for some time.  If I’m told
            “use UTF-8”, that’s cool, I could try to do that, but if I
            cannot, then I’m in an exceptional path.  Unicode could
            suggest that I consider behavior for BOMs (such as ignoring
            them if present), however I’m already stuck in my legacy
            behavior, so there’s a limit to what my application can do.<o:p></o:p></p>
        </blockquote>
        <p class="MsoNormal">This scenario fits the advice above.  The
          "use something other than a BOM" could mean adding a command
          line option, adding a menu option, remembering what encoding
          was used for that file last time, performing a heuristic
          analysis (that may or may not include the presence of a BOM in
          its calculation), prompting the user, etc...<br>
          <br>
          <o:p></o:p></p>
        <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">However, if Unicode says “if you see a
            BOM, then you must use UTF-8”, then users of my legacy
            application that is difficult to change, may have
            expectations of the application that don’t match reality. 
            They could even enter bugs like “The app isn’t recognizing
            data being tagged with BOMs.”  Or “your system isn’t
            compliant, so we can’t license it.”  If the app could
            properly handle UTF-8, we’d have been captured in the first
            requirements and wouldn’t even be having this part of the
            conversation.  Since they can’t handle UTF-8, trying to
            enforce it through the BOM isn’t going to add much.<o:p></o:p></p>
        </blockquote>
        <p class="MsoNormal">No part of this proposal states "if you see
          a BOM, then you must use UTF-8".  It only suggests guidelines;
          requirements are imposed by protocols as deemed appropriate by
          the protocol designers.<br>
          <br>
          <o:p></o:p></p>
        <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">IMO it’s better that everyone involved
            understand that this legacy app that can’t handle UTF-8 by
            default isn’t necessarily going to behave per any set
            expectations and likely has legacy behaviors that users may
            need to deal with. 
            <o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">Granted, the difference between
            “requiring,” and “suggesting” or “recommending”, may be
            subtle, however those subtleties can sometimes cause
            unnecessary pain.<o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">I don’t mind mandating UTF-8 without BOM
            if possible.  I don’t really mind mandating that BOMs be
            ignored if “without BOM” isn’t reasonable to mandate.<o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">After that though, it’s trying to create
            a higher order protocol for codepage detection.  BOM isn’t a
            great way to identify UTF-8 data.  (It’s probably more
            effective to decode it as UTF-8.  If it decodes properly,
            then it’s likely UTF-8.  With a certainty of about as many
            “nines” as you have bytes of input.  Linguistically
            appropriate strings that fail that test are rare.)<o:p></o:p></p>
        </blockquote>
        <p>We are agreed on these points.<o:p></o:p></p>
        <p>Tom.<o:p></o:p></p>
        <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">-Shawn<o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <div>
            <div style="border:none;border-top:solid #E1E1E1
              1.0pt;padding:3.0pt 0in 0in 0in">
              <p class="MsoNormal"><b>From:</b> Tom Honermann <a
                  href="mailto:tom@honermann.net" moz-do-not-send="true">
                  &lt;tom@honermann.net&gt;</a> <br>
                <b>Sent:</b> Tuesday, October 13, 2020 1:04 PM<br>
                <b>To:</b> Shawn Steele <a
                  href="mailto:Shawn.Steele@microsoft.com"
                  moz-do-not-send="true">&lt;Shawn.Steele@microsoft.com&gt;</a>;
                Alisdair Meredith
                <a href="mailto:alisdairm@me.com" moz-do-not-send="true">&lt;alisdairm@me.com&gt;</a><br>
                <b>Cc:</b> <a href="mailto:sg16@lists.isocpp.org"
                  moz-do-not-send="true">sg16@lists.isocpp.org</a>;
                Unicode Mail List
                <a href="mailto:unicode@unicode.org"
                  moz-do-not-send="true">&lt;unicode@unicode.org&gt;</a><br>
                <b>Subject:</b> Re: [SG16] Draft proposal: Clarify
                guidance for use of a BOM as a UTF-8 encoding signature<o:p></o:p></p>
            </div>
          </div>
          <p class="MsoNormal"> <o:p></o:p></p>
          <div>
            <p class="MsoNormal">On 10/12/20 4:54 PM, Shawn Steele
              wrote:<o:p></o:p></p>
          </div>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <p class="MsoNormal">I’m having trouble with the attempt to
              be this prescriptive.<br>
              <br>
              These make sense:  “Use Unicode!”<o:p></o:p></p>
            <ul type="disc">
              <li class="MsoNormal"
                style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l3
                level1 lfo4">
                If possible, mandate use of UTF-8 without a BOM;
                diagnose the presence of a BOM in consumed text as an
                error, and produce text without a BOM.<o:p></o:p></li>
              <li class="MsoNormal"
                style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l3
                level1 lfo4">
                Alternatively, swallow the BOM if present.<o:p></o:p></li>
            </ul>
            <p class="MsoNormal"
              style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">After
              that the situation is clearly hopeless.  Applications
              should Use Unicode, eg: UTF-8, and clearly there are cases
              happening where that isn’t happening.  Trying to prescribe
              that negotiation should therefore happen, or that BOMs
              should be interpreted or whatever is fairly meaningless at
              that point.  Given that the higher-order guidance of “Use
              Unicode” has already been ignored, at this point it’s
              garbage-in, garbage-out.  Clearly the app/whatever is
              ignoring the “use unicode” guidance for some legacy
              reason.  If they could adapt, it should be to use UTF-8. 
               It *<b>might</b>* be helpful to say something about a BOM
              likely indicating UTF-8 text in otherwise unspecified
              data, but prescriptive stuff is pointless, it’s legacy
              stuff that behaves in a legacy fashion for a reason and
              saying they should have done it differently 20 years ago
              isn’t going to help
              <span style="font-family:&quot;Segoe UI Emoji&quot;,sans-serif">😊</span>  <o:p></o:p></p>
          </blockquote>
          <p>There are applications that, for legacy reasons, are unable
            to change their default encoding to UTF-8, but that also
            need to handle UTF-8 text.  It is not clear to me that such
            situations are hopeless or that they cannot be improved.<o:p></o:p></p>
          <p>The prescription offered follows what you suggest.  The
            first three cases are all of the "use Unicode!"
            variety.  The distinction between the third and the fourth
            is to relegate use of a BOM as an encoding signature to the
            last resort option.  The intent is to make it clear, with
            stronger motivation than is currently present in the Unicode
            standard, that use of a BOM in UTF-8 is not a best practice
            today.<o:p></o:p></p>
          <p>Tom.<o:p></o:p></p>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <p class="MsoNormal"
              style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">-Shawn<o:p></o:p></p>
            <p class="MsoNormal"> <o:p></o:p></p>
            <div>
              <div style="border:none;border-top:solid #E1E1E1
                1.0pt;padding:3.0pt 0in 0in 0in">
                <p class="MsoNormal"><b>From:</b> Unicode <a
                    href="mailto:unicode-bounces@unicode.org"
                    moz-do-not-send="true">
                    &lt;unicode-bounces@unicode.org&gt;</a> <b>On
                    Behalf Of </b>Tom Honermann via Unicode<br>
                  <b>Sent:</b> Monday, October 12, 2020 7:03 AM<br>
                  <b>To:</b> Alisdair Meredith <a
                    href="mailto:alisdairm@me.com"
                    moz-do-not-send="true">&lt;alisdairm@me.com&gt;</a><br>
                  <b>Cc:</b> <a href="mailto:sg16@lists.isocpp.org"
                    moz-do-not-send="true">sg16@lists.isocpp.org</a>;
                  Unicode List
                  <a href="mailto:unicode@unicode.org"
                    moz-do-not-send="true">&lt;unicode@unicode.org&gt;</a><br>
                  <b>Subject:</b> Re: [SG16] Draft proposal: Clarify
                  guidance for use of a BOM as a UTF-8 encoding
                  signature<o:p></o:p></p>
              </div>
            </div>
            <p class="MsoNormal"> <o:p></o:p></p>
            <div>
              <p class="MsoNormal">Great, here is the change I'm making
                to address this:<o:p></o:p></p>
            </div>
            <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
              <div>
                <p class="MsoNormal">Protocol designers:<o:p></o:p></p>
              </div>
              <div>
                <ul type="disc">
                  <li class="MsoNormal"
                    style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l3
                    level1 lfo4">
                    If possible, mandate use of UTF-8 without a BOM;
                    diagnose the presence of a BOM in consumed text as
                    an error, and produce text without a BOM.<o:p></o:p></li>
                  <li class="MsoNormal"
                    style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l3
                    level1 lfo4">
                    Otherwise, if possible, mandate use of UTF-8 with or
                    without a BOM; accept and discard a BOM in consumed
                    text, and produce text without a BOM.<o:p></o:p></li>
                  <li class="MsoNormal"
                    style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l3
                    level1 lfo4">
                    Otherwise, if possible, use UTF-8 as the default
                    encoding with use of other encodings negotiated
                    using information other than a BOM; accept and
                    discard a BOM in consumed text, and produce text
                    without a BOM.<o:p></o:p></li>
                  <li class="MsoNormal"
                    style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l3
                    level1 lfo4">
                    Otherwise, require the presence of a BOM to
                    differentiate UTF-8 encoded text in both consumed
                    and produced text<b><span style="color:#009900">
                        unless the absence of a BOM would result in the
                        text being interpreted as an ASCII-based
                        encoding and the UTF-8 text contains no
                        non-ASCII characters (the exception is intended
                        to avoid the addition of a BOM to ASCII text
                        thus rendering such text as non-ASCII)</span></b>.
                    This approach should be reserved for scenarios in
                    which UTF-8 cannot be adopted as a default due to
                    backward compatibility concerns.<o:p></o:p></li>
                </ul>
              </div>
            </blockquote>
            <div>
              <p class="MsoNormal">Tom.<o:p></o:p></p>
            </div>
            <div>
              <p class="MsoNormal"> <o:p></o:p></p>
            </div>
            <div>
              <p class="MsoNormal">On 10/12/20 8:40 AM, Alisdair
                Meredith wrote:<o:p></o:p></p>
            </div>
            <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
              <p class="MsoNormal">That addresses my main concern.
                 Essentially, best practice (for UTF-8) would be no BOM
                unless the document contains code points that require
                multiple code units to express.
                <o:p></o:p></p>
              <div>
                <p class="MsoNormal"> <o:p></o:p></p>
              </div>
              <div>
                <p class="MsoNormal">AlisdairM<o:p></o:p></p>
                <div>
                  <p class="MsoNormal"><br>
                    <br>
                    <br>
                    <br>
                    <o:p></o:p></p>
                  <blockquote
                    style="margin-top:5.0pt;margin-bottom:5.0pt">
                    <div>
                      <p class="MsoNormal">On Oct 11, 2020, at 23:22,
                        Tom Honermann &lt;<a
                          href="mailto:tom@honermann.net"
                          moz-do-not-send="true">tom@honermann.net</a>&gt;
                        wrote:<o:p></o:p></p>
                    </div>
                    <p class="MsoNormal"> <o:p></o:p></p>
                    <div>
                      <div>
                        <div>
                          <p class="MsoNormal">On 10/10/20 7:58 PM,
                            Alisdair Meredith via SG16 wrote:<o:p></o:p></p>
                        </div>
                        <blockquote
                          style="margin-top:5.0pt;margin-bottom:5.0pt">
                          <p class="MsoNormal">One concern I have, that
                            might lead into rationale for the current
                            discouragement,
                            <o:p></o:p></p>
                          <div>
                            <p class="MsoNormal">is that I would hate to
                              see a best practice that pushes a BOM into
                              ASCII files.<o:p></o:p></p>
                          </div>
                          <div>
                            <p class="MsoNormal">One of the nice
                              properties of UTF-8 is that a valid ASCII
                              file (still very common) is<o:p></o:p></p>
                          </div>
                          <div>
                            <p class="MsoNormal">also a valid UTF-8
                              file.  Changing best practice would
                              encourage updating those<o:p></o:p></p>
                          </div>
                          <div>
                            <p class="MsoNormal">files to be no longer
                              ASCII.<o:p></o:p></p>
                          </div>
                        </blockquote>
                        <p class="MsoNormal"
                          style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">Thanks,
                          Alisdair.  I think that concern is implicitly
                          addressed by the suggested resolutions, but
                          perhaps that can be made more clear.  One
                          possibility would be to modify the "protocol
                          designer" guidelines to address the case where
                          a protocol's default encoding is ASCII based
                          and to specify that a BOM is only required for
                          UTF-8 text that contains non-ASCII
                          characters.  Would that be helpful?<o:p></o:p></p>
                        <p class="MsoNormal"
                          style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">Tom.<o:p></o:p></p>
                        <blockquote
                          style="margin-top:5.0pt;margin-bottom:5.0pt">
                          <div>
                            <p class="MsoNormal"> <o:p></o:p></p>
                          </div>
                          <div>
                            <p class="MsoNormal">AlisdairM<o:p></o:p></p>
                            <div>
                              <p class="MsoNormal"><br>
                                <br>
                                <br>
                                <br>
                                <o:p></o:p></p>
                              <blockquote
                                style="margin-top:5.0pt;margin-bottom:5.0pt">
                                <div>
                                  <p class="MsoNormal">On Oct 10, 2020,
                                    at 14:54, Tom Honermann via SG16
                                    &lt;<a
                                      href="mailto:sg16@lists.isocpp.org"
                                      moz-do-not-send="true">sg16@lists.isocpp.org</a>&gt;
                                    wrote:<o:p></o:p></p>
                                </div>
                                <p class="MsoNormal"> <o:p></o:p></p>
                                <div>
                                  <div>
                                    <p class="MsoNormal"
                                      style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">Attached
                                      is a draft proposal for the
                                      Unicode standard that intends to
                                      clarify the current recommendation
                                      regarding use of a BOM in UTF-8
                                      text.  This is follow up to
                                      <a
                                        href="https://corp.unicode.org/pipermail/unicode/2020-June/008713.html"
                                        moz-do-not-send="true">discussion
                                        on the Unicode mailing list</a>
                                      back in June.<o:p></o:p></p>
                                    <p class="MsoNormal"
                                      style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">Feedback
                                      is welcome.  I plan to
                                      <a
                                        href="https://www.unicode.org/pending/docsubmit.html"
                                        moz-do-not-send="true">submit</a>
                                      this to the UTC in a week or so
                                      pending review feedback.<o:p></o:p></p>
                                    <p class="MsoNormal"
                                      style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">Tom.<o:p></o:p></p>
                                  </div>
                                  <p class="MsoNormal">&lt;Unicode-BOM-guidance.pdf&gt;--
                                    <br>
                                    SG16 mailing list<br>
                                    <a
                                      href="mailto:SG16@lists.isocpp.org"
                                      moz-do-not-send="true">SG16@lists.isocpp.org</a><br>
                                    <a
                                      href="https://lists.isocpp.org/mailman/listinfo.cgi/sg16"
                                      moz-do-not-send="true">https://lists.isocpp.org/mailman/listinfo.cgi/sg16</a><o:p></o:p></p>
                                </div>
                              </blockquote>
                            </div>
                            <p class="MsoNormal"> <o:p></o:p></p>
                          </div>
                          <p class="MsoNormal"><br>
                            <br>
                            <br>
                            <br>
                            <o:p></o:p></p>
                        </blockquote>
                        <p class="MsoNormal"
                          style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
                      </div>
                    </div>
                  </blockquote>
                </div>
                <p class="MsoNormal"> <o:p></o:p></p>
              </div>
            </blockquote>
            <p> <o:p></o:p></p>
          </blockquote>
          <p> <o:p></o:p></p>
        </blockquote>
        <p><o:p> </o:p></p>
      </div>
    </blockquote>
    <p><br>
    </p>
  </body>
</html>