<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
There has to be one entity for each item to be referenced.
An alternate method (rfc include) is described in the references. -->
<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC3261 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3261.xml">
<!ENTITY RFC3550 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3550.xml">
<!ENTITY RFC4103 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4103.xml">
<!ENTITY RFC4353 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4353.xml">
<!ENTITY RFC4575 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4575.xml">
<!ENTITY RFC4579 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4579.xml">
<!ENTITY RFC4597 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4597.xml">
<!ENTITY RFC7667 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7667.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs),
please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
(Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space
(using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="bcp" docName="draft-hellstrom-mmusic-multi-party-rtt-01"
ipr="trust200902">
<!-- category values: std, bcp, info, exp, and historic
ipr values: trust200902, noModificationTrust200902, noDerivativesTrust200902,
or pre5378Trust200902
you can add the attributes updates="NNNN" and obsoletes="NNNN"
they will automatically be output with "(if approved)" -->
<!-- ***** FRONT MATTER ***** -->
<front>
<!-- The abbreviated title is used in the page header - it is only necessary if the
full title is longer than 39 characters -->
<title abbrev="Real-time text multi-party handling">Real-time text media handling in
multi-party conferences</title>
<!-- add 'role="editor"' below for the editors if appropriate -->
<!-- Another author who claims to be an editor -->
<author fullname="Gunnar Hellstrom" initials="G." surname="Hellstrom">
<organization>Omnitor</organization>
<address>
<postal>
<street>Esplanaden 30</street>
<!-- Reorder these if your country does things differently -->
<city>Vendelso</city>
<code>SE-136 70</code>
<country>SE</country>
</postal>
<phone>+46 708 204 288</phone>
<email>[email protected]</email>
<uri>www.omnitor.se</uri>
<!-- uri and facsimile elements may also be added -->
</address>
</author>
<date month="February" year="2020" />
<!-- If the month and year are both specified and are the current ones, xml2rfc will fill
in the current day for you. If only the current year is specified, xml2rfc will fill
in the current day and month for you. If the year is not the current one, it is
necessary to specify at least a month (xml2rfc assumes day="1" if not specified for the
purpose of calculating the expiry date). With drafts it is normally sufficient to
specify just the year. -->
<!-- Meta-data Declarations -->
<area>General</area>
<workgroup>Internet Engineering Task Force</workgroup>
<!-- WG name at the upperleft corner of the doc,
IETF is fine for individual submissions.
If this element is not present, the default is "Network Working Group",
which is used by the RFC Editor as a nod to the history of the IETF. -->
<keyword>Internet-Draft</keyword>
<!-- Keywords will be incorporated into HTML output
files in a meta tag but they have no effect on text or nroff
output. If you submit your draft to the RFC Editor, the
keywords will be used for the search engine. -->
<abstract>
<t>This memo specifies methods for Real-Time Text (RTT) media handling in multi-party
calls. The main solution is to carry Real-Time text by the RTP protocol
in a time-sampled mode according to RFC 4103. The main solution for centralized
multi-party handling of real-time text is achieved through a media
control unit coordinating multiple RTP text streams into one RTP session.</t>
<t>
Identification of the streams is provided through the CSRC lists in the
RTP packets and through the RTCP messages.
This mechanism enables the receiving application to present the received
real-time text medium separated per source, in different ways according to user preferences.
Some presentation related features are also described explaining
suitable variations of transmission and presentation of text.
</t>
<t>Call control features are described for the SIP environment. A number
of alternative methods for providing the multi-party negotiation,
transmission and presentation are discussed and a recommendation
for the main one is provided. Two alternative methods using a single
RTP stream and source identification inline in the text stream are also
described, one of them being provided as a lower functionality fallback
method for endpoints with no multi-party awareness for RTT.</t>
<t>Brief information is also provided for multi-party RTT in the WebRTC environment.</t>
<t>EDITOR NOTE: A number of alternatives are specified for discussion. A
decision is needed which alternatives are preferred and then how
the preferred alternatives shall be emphasized.</t>
</abstract>
</front>
<middle>
<section title="Introduction">
<t>Real-time text (RTT) is a medium in real-time conversational sessions. Text
entered by participants in a session is transmitted in a time-sampled
fashion, so that no specific user action is needed to cause
transmission. This gives a direct flow of text at the rate it is created,
that is suitable in a real-time conversational setting. The real-time text
medium can be combined with other media in multimedia sessions.</t>
<t>Media from a number of multimedia session participants can be combined
in a multi-party session. This memo specifies how the real-time text streams
are handled in multi-party sessions.</t>
<t>The description is mainly focused on the transport level, but also
describes a few session and presentation level aspects.</t>
<t>Transport of real-time text is specified in <xref
target="RFC4103">RFC 4103</xref> RTP Payload for text conversation. It
makes use of <xref target="RFC3550">RFC 3550</xref> Real Time Protocol,
for transport. Robustness against network transmission problems is normally
achieved through redundant transmission based on the principle from RFC 2198,
with one primary and two redundant transmissions of each text element. Primary and redundant
transmissions are combined in packets and described by a redundancy header.
This transport is usually used in the SIP Session Initiation Protocol
<xref target="RFC3261">RFC 3261</xref> environment.</t>
<t>A very brief overview of functions for real-time
text handling in multi-party sessions is described in <xref
target="RFC4597">RFC 4597</xref> Conferencing Scenarios, sections 4.8 and 4.10. This
specification builds on that description and indicates which
protocol mechanisms should be used to implement multi-party handling of
real-time text.</t>
<t>EDITOR NOTE: A number of alternatives are specified for discussion. A
decision is needed which alternatives are preferred and then how
the preferred alternatives shall be emphasized.</t>
<section title="Requirements Language">
<t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
document are to be interpreted as described in <xref
target="RFC2119">RFC 2119</xref>.</t>
</section>
</section>
<section title="Centralized conference model">
<t>In the centralized conference model for SIP, introduced in <xref
target="RFC4353">RFC 4353</xref> A Framework for Conferencing with the Session
Initiation Protocol (SIP), one function co-ordinates the
communication with participants in the multi-party session. This function
also controls media mixer functions for the media appearing in the
session. The central function is common for control of all media, while
the media mixers may work differently for each medium.</t>
<t>The central function is called the Focus UA and may be co-located in
an advanced terminal including multi-party control functions, or it may
be located in a separate location. Many variants exist for setting up
sessions including the multipoint control centre. It is not within scope
of this description to describe these, but rather the media specific
handling in the mixer required to handle multi-party calls with RTT.</t>
<t>The main principle for handling real-time text media in a centralized
conference is that one RTP session for real-time text is established
including the multipoint media control centre and the participating endpoints which are
going to have real-time text exchange with the others.</t>
<t>The different possible mechanisms for mixing and transporting RTT differ in the way they multiplex
the text streams and how they identify the sources of the streams. <xref
target="RFC7667">RFC 7667</xref> describes a number
of possible use cases for RTP. This specification refers to different sections of RFC 7667
for further reading of the situations caused by the different possible design choices.</t>
</section>
<section title="Requirements on multi-party RTT">
<t>The following requirements are placed on multi-party RTT:</t>
<t>
<list style="empty">
<t>The solution shall be applicable to IMS (3GPP TS 22.173), SIP based
VoIP and Next Generation Emergency Services (NENA i3, ETSI TS 103 479, RFC 6443). </t>
<t>The transmission interval for text must not be longer than 500 milliseconds when
there is anything available to send. Ref ITU-T T.140.</t>
<t>If text loss is detected or suspected, a missing text marker shall be inserted in
the text stream where the loss is detected or suspected. Ref ITU-T T.140 Amendment 1.
ETSI EN 301 549</t>
<t>The display of text from the members of the conversation shall be arranged so that the text from
each participant is clearly readable, and its source and the relative timing of entered text is visualized
in the display. Mechanisms for looking back in the contents from the current session should be
provided. The text should be displayed as soon as it is received. Ref ITU-T T.140 </t>
<t>Bridges must be multimedia capable (voice, video, text). Ref NENA i3 STA-010.2. </t>
<t>R7: It MUST be possible to use real-time text in conferences both as
a medium of discussion between individual participants (for example,
for sidebar discussions in real-time text while listening to the main
conference audio) and for central support of the conference with
real-time text interpretation of speech. Ref RFC 5194. </t>
<t>It should be possible to protect RTT contents with usual means for privacy and integrity. Ref RFC 6881 section 16</t>
<t>Conferencing procedures are documented in RFC 4579. Ref NENA i3 STA-010.2. </t>
<t>Conferencing applies to any kind of media stream by which users may want to communicate...
Ref 3GPP TS 24.147</t>
<t>The framework for SIP conferences is specified in RFC 4353. Ref 3GPP TS 24.147</t>
</list>
</t>
</section>
<section title="Coordination of text RTP streams">
<t>Coordinating and sending text RTP streams in the multi-party session can be done in a number of ways.
The most suitable methods are specified here with pros and cons.</t>
<t> A receiving UA SHOULD separate text from the different sources and
identify and display them accordingly.</t>
<section title="RTP Translator sending one RTT stream per participant">
<t>Within the RTP session, text from each participant is transmitted from the
RTP media translator in a separate RTP stream, thus using the same
destination address/port combination, but separate RTP SSRC parameters and sequence number series as
described in Section 7.1 and 7.2 of RTP <xref target="RFC3550">RFC
3550</xref> about the Translator function. The sources of the text
in each RTP packet are identified by the SSRC parameters in the RTP packets, containing the
SSRC of the initial sources of text.</t>
<t> A receiving UA is supposed to separate text items from the different sources and
identify and display them in a suitable way.</t>
<t>This method is described in RFC 7667,
section 3.5.1 Relay-transport translator or 3.5.2 Media translator.</t>
<t>The identification of the source is made through the RTCP SDES
CNAME and NAME packets as described in RTP <xref
target="RFC3550"></xref>.</t>
<t>Pros:</t>
<t> This method has moderate overhead. When loss of packets occurs,
it is possible to recover text from redundancy at loss of up to the number
of redundancy levels carried in the RFC 4103 stream (normally primary and
two redundant levels). </t>
<t>More loss than what can be recovered, can be detected and the marker for text
loss can be inserted in the correct stream.</t>
<t>It may be possible in some scenarios to keep the text encrypted through the Translator.</t>
<t>Cons:</t>
<t>There may be RTP implementations not supporting the Translator model. </t>
<t>It is even most likely that this configuration is not supported by current media declarations in SDP.
RFC 3264 specifies in many places that one media description is supposed to describe just one RTP stream. </t>
</section>
<section title="RTP Mixer indicating sources in CSRC-list">
<t>
An RTP media mixer combines text from all participants except from the receiving endpoint into one
RTP stream, thus all using the same
destination address/port combination, the same RTP SSRC, and one sequence number series as
described in Section 7.1 and 7.3 of RTP <xref target="RFC3550">RFC
3550</xref> about the Mixer function. The sources of the text
in each RTP packet are identified by the CSRC parameters in the RTP packets, containing the
SSRC of the initial sources of text. The order of the CSRC parameters is the
same as the order of the redundant and primary data fields in the packet. If all redundancy
blocks in a packet are from the same source,
then it is allowed to use only one CSRC in the RTP packet. This method is described in RFC 7667,
section 3.6.3 Media switching mixer.</t>
<t>A set of specific rules for the application of this method together with RFC 4103 is needed.</t>
<t>The identification of the source can be made through the RTCP SDES
CNAME and NAME packets as described in RTP <xref target="RFC3550"></xref>.</t>
<t>Also information provided through the notification according to RFC 4575 when the participant
joined the conference provides suitable information and a reference to the SSRC.</t>
<t> A receiving UA is supposed to separate text items from the different sources and
identify and display them accordingly.</t>
<t>The ordered CSRC lists in the RFC 4103 packets make it possible to recover from
loss of one and two packets in sequence and assign the recovered text to the right source.
For more loss, a marker for possible loss should be inserted or presented.</t>
<t>The conference server needs to have authority to decrypt the payload in
the RTP packets in order to be able to recover text from redundant data or insert the
missing text marker in the stream, and repack the text in new packets.</t>
<t>Pros:</t>
<t> This method has moderate overhead. </t>
<t>When loss of packets occurs,
it is possible to recover text from redundancy at loss of up to the number
of redundancy levels carried in the RFC 4103 stream (normally primary and
two redundant levels).</t>
<t>This method can be implemented with most RTP implementations.</t>
<t></t>
<t>Cons:</t>
<t> When more consecutive packet loss than the number of generations of
redundant data appears, it is not possible to deduce the sources of the totally lost data.
Therefore it is not possible to know in which stream to insert the missing text marker.
It MAY be acceptable to either indicate a general loss indication, or insert a loss marker in all streams.
Calculations of most likely source can however be made from received RTP and RTCP
contents so that the loss marker can be inserted in the most likely struck stream.</t>
<t>The conference server needs to be allowed to decrypt/encrypt the packet payload.
This is however normal for media mixers for other media.</t>
</section>
<section title="Distributing packets in an end-to-end encryption structure">
<t>In order to achieve end-to-end encryption, it is possible to let the packets from the sources
just pass through a central distributor, and handle the security agreements between the participants.
Specifications exist for a framework with this functionality suitable for application on RTP based
conferences in draft-ietf-perc-private-media-framework.
The RTP flow and mixing characteristics have similarities with the method described under
"RTP Translator sending one RTT stream per participant" above.
RFC 4103 RTP streams would fit into the structure and it would provide a base for end-to-end encrypted
RTT multi-party conferencing.
</t>
<t>Pros: </t>
<t>Good security </t>
<t>Straightforward multi-party handling.</t>
<t>Cons: </t>
<t>Does not operate under the usual SIP central conferencing architecture.</t>
<t>Requires the participants to perform a lot of key handling.</t>
</section>
<section title="RTP Mixer indicating participants by a control code in the stream">
<t>Text from all participants except the receiving one is transmitted from the
media mixer in the same RTP session and stream, thus all using the same
destination address/port combination, the same RTP SSRC, and one sequence number series as
described in Section 7.1 and 7.3 of RTP <xref target="RFC3550">RFC
3550</xref> about the Mixer function. The sources of the text
in each RTP packet are identified by a new defined T.140 control code "c"
followed by a unique identification of the source in UTF-8 string format.</t>
<t>The receiver can use the string for presenting the source of text. This method is
on the RTP level described in RFC 7667, section 3.6.2 Media mixing mixer.</t>
<t>The inline coding of the source of text is applied in the data stream itself, and an
RTP mixer function is used for
coordinating the sources of text into one RTP stream.</t>
<t>Information uniquely identifying each user in the multi-party session
is placed as the parameter value "n" in the
T.140 application protocol function with the function code "c". The
identifier shall thus be formatted like this: SOS c n ST, where SOS and ST are
coded as specified in <xref target="T.140">ITU-T T.140</xref>.
The "c" is the letter "c". The n parameter value is a string uniquely identifying
the source. This parameter shall be kept short
so that it can be repeated in the transmission without concerns for
network load.</t>
<t> A receiving UA is supposed to separate text items from the different sources and
identify and display them accordingly.</t>
<t>The conference server needs to be allowed to decrypt/encrypt the packet payload in order to check the source and repack the text.</t>
<t>Pros: </t>
<t>If loss of packets occurs, it is possible to recover text from redundancy at loss of up to the number
of redundancy levels carried in the RFC 4103 stream (normally primary and
two redundant levels).</t>
<t>This method can be implemented with most RTP implementations.</t>
<t>Transmitted text can also be used with other transports than RTP</t>
<t>Cons: </t>
<t>If more consecutive packet loss than the number of generations of
redundant data appears, it is not possible to deduce the source of the totally lost data.
Therefore it is not possible to know in which stream to insert the missing text marker.
Calculations of most likely source can however be made from recent history, so that it is quite likely that the marker is inserted in the correct stream. Such loss should however be rare, and a general warning that there might have been text loss in the session might be acceptable.</t>
<t>The mixer needs to be able to generate suitable and unique source identifications
which are suitable as labels for the sources. </t>
<t>Requires an extension on the ITU-T T.140 standard, best made by the ITU. </t>
<t>The conference server needs to be allowed to decrypt/encrypt the packet payload.</t>
</section>
<section title="Mesh of RTP endpoints">
<t>Text from all participants is transmitted directly to all others in one RTP session,
without a central bridge. The sources of the text in each RTP packet are identified by
the source network address and the SSRC. </t>
<t>This method is described in RFC 7667, section 3.4 Point to multi-point using mesh.</t>
<t>Pros: </t>
<t>When loss of packets occurs, it is possible to recover text from
redundancy at loss of up to the number of redundancy levels carried
in the RFC 4103 stream (normally primary and two redundant levels).</t>
<t>This method can be implemented with most RTP implementations.</t>
<t>Transmitted text can also be used with other transports than RTP</t>
<t>Cons: </t>
<t>This model is not described in IMS, NENA and EENA specifications, and does therefore not meet the requirements.</t>
</section>
<section title="Multiple RTP sessions, one for each participant">
<t>Text from all participants is transmitted directly to all others in one RTP session each,
without a central bridge. Each session is established with a separate media description in SDP.
The sources of the text in each RTP packet are identified by
the source network address and the SSRC. </t>
<t>This method is out of scope for further discussion here, because the foreseen applications use centralized model conferencing.</t>
<t>Pros: </t>
<t>When loss of packets occurs, it is possible to recover text from
redundancy at loss of up to the number of redundancy levels carried
in the RFC 4103 stream (normally primary and two redundant levels).</t>
<t>Complete loss of text can be indicated in the received stream.</t>
<t>This method can be implemented with most RTP implementations.</t>
<t>End-to-end encryption is achievable.</t>
<t>Cons:</t>
<t> This method is not described in IMS, NENA and EENA specifications and does therefore not meet the requirements.</t>
<t>A lot of network resources are spent on setting up separate sessions for each participant.</t>
</section>
<section title="Mixing for conference-unaware user agents">
<t>Multi-party real-time text contents can be transmitted to
conference-unaware user agents if source labeling and formatting of the
text is performed by a mixer. This method has the limitations that the
layout of the presentation and the format of source identification is purely controlled by the mixer, and
that only one source at a time is allowed to present in real-time. Other
sources need to be stored temporarily waiting for an appropriate moment
to switch the source of transmitted text. The mixer controls the switching
of sources and inserts a source identifier in text format at the beginning of text after switch of source.
The logic of the mixer to detect when a switch is appropriate should detect
a number of places in text where a switch can be allowed, including new line,
end of sentence, end of phrase, a period of inactivity, and a word separator after a long time
of active transmission.</t>
<t>This method MAY be used when no support for multi-party awareness is detected in the
receiving endpoint. The base for this method is described in RFC 7667,
section 3.6.2 Media mixing mixer.</t>
<t> See Appendix A for an informative example of a procedure for presenting RTT to a conference-unaware UA.</t>
<t>Pros:</t>
<t> Can be transmitted to conference-unaware endpoints.</t>
<t>Can be used with other transports than RTP</t>
<t>Cons:</t>
<t> Does not allow full real-time presentation of more than one source at a time. Text from other sources will
be delayed, even if automatic detection of suitable moments for switching source for presentation is made by the mixer.</t>
<t>The only realistic presentation format is a style with the text from the different sources presented with a text label
indicating source, and the text collected in a chat style presentation but with more frequent turn-taking.</t>
<t>Endpoints often have their own system for adding labels to the RTT presentation. In that case there will be two levels of labels in the presentation, one for the mixer and one for the sources.</t>
<t>If loss of more packets than can be recovered by the redundancy appears, it is not possible to detect which source
was struck by the loss. It is also possible that a source switch occurred during the loss, and therefore a false indication
of the source of text can be provided to the user after such loss.</t>
<t>Because of all these cons, this method MUST NOT be used as the main method, but only as the last resort
for backwards interoperability with conference-unaware endpoints. </t>
<t>The conference server needs to be allowed to decrypt/encrypt the packet payload.</t>
</section>
</section>
<section title="RTT bridging in WebRTC">
<t>Within WebRTC, real-time text is specified to be carried in WebRTC data
channels as specified in draft-ietf-mmusic-t140-usage-data-channel. A few ways to handle multi-party
RTT are mentioned briefly. They are explained and further detailed below.</t>
<section title="RTT bridging in WebRTC with one data channel per source">
<t>
A straightforward way to handle
multi-party RTT is for the bridge to open one T.140 data channel per source towards the receiving participants.
</t>
<t>The stream-id forms a unique stream identification.</t>
<t>The identification of the source is made through the Label property of the channel,
and session information belonging to the source. The UA can compose a readable label for the presentation from this information.</t>
<t>Pros: </t>
<t> This is a straightforward solution. </t>
<t>Cons: </t>
<t>With a high number of participants, the overhead of establishing the high number of data channels required may be high.</t>
</section>
<section title="RTT bridging in WebRTC with one common data channel">
<t>
A way to handle
multi-party RTT in WebRTC is for the bridge to combine text from all sources into one data
channel and insert the sources in the stream by a T.140 control code for source.</t>
<t>This method is described in a corresponding section for RTP transmission above. </t>
<t>The identification of the source is made through insertion in the beginning of each text transmission
from a source of a control code extension "c" followed by a string representing
the source, framed by the control code start and end flags SOS and ST
(See <xref target="T.140">ITU-T T.140</xref>).</t>
<t> A receiving UA is supposed to separate text items from the different sources and
identify and display them in a suitable way.</t>
<t>The UA does not always display the source identification in the received text
at the place where it is received, but has the information as a guide for
planning the presentation of received text. A label corresponding to the
source identification is presented when needed depending on the selected
presentation style.</t>
<t>Pros:</t>
<t> This solution has relatively low overhead on session and network level </t>
<t>Cons:</t>
<t> This solution has higher overhead on the media contents level than the WebRTC solution above.</t>
<t>Standardisation of the new control code "c" in ITU-T T.140 is required.</t>
<t>The conference server needs to be allowed to decrypt/encrypt the data channel contents.</t>
</section>
</section>
<section title="Preferred multi-party RTT transport method">
<t>EDITOR NOTE: The recommendations here need to be validated, and the proposed further studies performed.</t>
<t>For RTP transport of RTT, the method for multi-party mixing and transport for conference-aware parties
that stands out as fulfilling the goals best is: "RTP Mixer indicating participants in CSRC".</t>
<t>For WebRTC, one method is preferred because of its simplicity.
So, for WebRTC, the method to implement for multi-party RTT with
conference-aware parties when no other method is explicitly agreed between
implementing parties is: "RTT bridging in WebRTC with one data channel per source".</t>
</section>
<section title="Session control of multi-party RTT sessions">
<t>General session control aspects for multi-party sessions are
described in <xref target="RFC4575">RFC 4575</xref> A Session
Initiation Protocol (SIP) Event Package for Conference State, and
<xref target="RFC4579">RFC 4579</xref> Session Initiation Protocol
(SIP) Call Control - Conferencing for User Agents. The nomenclature of
these specifications is used here.</t>
<t>The procedures for a conference-aware model for RTT-transmission shall only be applied if a
capability exchange for conference-aware real-time text transmission has
been completed and a supported method for multi-party real-time text transmission can be identified.</t>
<t>A method for detection of conference-awareness for centralized SIP conferencing in general is
specified in <xref target="RFC4579">RFC 4579</xref>. The focus sends the "isfocus" feature tag in a
SIP Contact header. This causes
the conference-aware UA to subscribe to conference notifications from the focus. The focus then sends
notifications to the UA about entering and disappearing conference participants and their media capabilities.
The information is carried XML-formatted in a 'conference-info' block in the notification according to RFC 4575.
The mechanism is described in detail in <xref target="RFC4575">RFC 4575</xref>. </t>
<t>Before a conference media server starts sending multi-party RTT to a UA, a verification of its ability
to handle multi-party RTT must be made. A decision on which mechanism to use for identifying text from the
different participants must also be taken, implicitly or explicitly. These verifications and decisions can
be done in a number of ways. The most apparent ways are specified here and their pros and cons described.
One of the methods is selected to be the
one to be used by implementations according to this specification.</t>
<section title="Implicit RTT multi-party capability indication">
<t>
Capability for RTT multi-party handling can be decided to be implicitly indicated by session control items. </t>
<t>The focus may implicitly indicate multi-party RTT capability by including the media child with value "text"
in the RFC 4575 conference-info provided in conference notifications.</t>
<t>A UA may implicitly indicate multi-party RTT capability by including the text media in the SDP in the session
control transactions with the conference focus after the subscription to the conference has taken place. </t>
<t>The implicit RTT capability indication means for the focus that it can handle multi-party RTT according to
the preferred method indicated in the RTT multi-party methods section above.</t>
<t>The implicit RTT capability indication means for the UA that it can handle multi-party RTT according to
the preferred method indicated in the RTT multi-party methods section above.</t>
<t>If the focus detects that a UA implicitly declared RTT multi-party capability, it SHALL provide RTT
according to the preferred method.</t>
<t>If the focus detects that the UA does not indicate any RTT multi-party capability, then it shall either provide
RTT multi-party text in the way specified for conference-unaware UA above, or refuse to set up the session.</t>
<t>If the UA detects that the focus has implicitly declared RTT multi-party capability, it shall be prepared
to present RTT in a multi-party fashion according to the preferred method.</t>
<t>Pros: </t>
<t>Acceptance of implicit multi-party capability implies that no standardisation of explicit RTT
multi-party capability exchange is required.</t>
<t>Cons: </t>
<t>If other methods for multi-party RTT are to be used in the same implementation environment
as the preferred ones, then capability exchange needs to be defined for them. </t>
<t>Cannot be used outside a strictly applied SIP central conference model.</t>
</section>
<section title="RTT multi-party capability declared by SIP media-tags">
<t>Specifications for RTT multi-party capability declarations can be agreed for use as SIP media feature tags,
to be exchanged during SIP call control operation according to the mechanisms in RFC 3840 and RFC 3841.
Capability for the RTT Multi-party capability is then indicated by the media feature tag "rtt-mixer", with one
or more of its possible values in a comma-separated list.</t>
<t>The possible values in the list are:</t>
<t>
<list style="empty">
<t>rtp-translator</t>
<t>rtp-mixer</t>
<t>t140-mixer</t>
<t>text-mixer</t>
<t>rtp-mesh</t>
<t>multi-session</t>
</list></t>
<t>rtp-translator indicates capability for using the RTP-translator
based coordination of multi-party text.</t>
<t>rtp-mixer indicates capability for using the RTP-mixer based
presentation of multi-party text.</t>
<t> t140-mixer indicates capability for
using the T.140 control code source indicators in a mixer. </t>
<t>text-mixer
indicates capability for using the fallback method with text
formatting for conference-unaware endpoints.</t>
<t>rtp-mesh indicates capability for using the mesh
based transmission of multi-party text.
</t>
<t>multi-session indicates capability for using separate point-to-point
RTP sessions between all participants.</t>
<t>Example: Contact: &lt;sip:focus@example.com&gt;</t>
<t>;methods="INVITE,ACK,OPTIONS,BYE,CANCEL"</t>
<t>;+sip.rtt-mixer="multi-session"</t>
<t>If, after evaluation of the alternatives in this specification, only one mixing
method is selected to be brought to implementation, then the media tag can
be reduced to a single tag with no list of values. </t>
<t></t>
<t>An offer-answer exchange should take place and the common method selected by the
answering party shall be used in the session with that UA.</t>
<t>When no common method is declared, then only the fallback method can be used or the session dropped.</t>
<t>If more than one text media line is included in SDP, all must be capable of using
the declared RTT multi-party method.</t>
<t>Pros:</t>
<t> Provides a clear decision method.</t>
<t>Can be extended with new mixing methods.</t>
<t>Can guide call routing to a suitable capable focus.</t>
<t>Cons:</t>
<t> Requires standardization and IANA registration.</t>
<t> Is not stream specific. If more than one text stream
is specified, all must have the same type of multi-party capability.</t>
<t>Cannot be used in the WebRTC environment.</t>
</section>
<section title="SDP media attribute for RTT multi-party capability indication">
<t>An attribute can be specified on media level, to be used in text media SDP
declarations for negotiating RTT multi-party capabilities.
The attribute can have the name "rtt-mixer", with one
or more of its possible values in a comma-separated list.</t>
<t>The possible values in the list are:</t>
<t>
<list style="empty">
<t>rtp-translator</t>
<t>rtp-mixer</t>
<t>t140-mixer</t>
<t>text-mixer</t>
<t>rtp-mesh</t>
<t>multi-session</t>
</list></t>
<t>rtp-translator indicates capability for using the RTP-translator
based coordination of multi-party text.</t>
<t>rtp-mixer indicates capability for using the RTP-mixer based
presentation of multi-party text.</t>
<t> t140-mixer indicates capability for
using the T.140 control code source indicators in a mixer. </t>
<t>text-mixer
indicates capability for using the fallback method with text
formatting for conference-unaware endpoints.</t>
<t>rtp-mesh indicates capability for using the mesh
based transmission of multi-party text.
</t>
<t>multi-session indicates capability for using separate point-to-point
RTP sessions between all participants.</t>
<t></t>
<t>An offer-answer exchange should take place and the common method selected by the
answering party shall be used in the session with that UA.</t>
<t>When no common method is declared, then only the fallback method can be used.</t>
<t> Example: a=rtt-mixer:rtp-mixer</t>
<t>If, after evaluation of the alternatives in this specification, only one mixing
method is selected to be brought to implementation, then the attribute can
be reduced to a single attribute with no list of values. </t>
<t>Pros:</t>
<t> Provides a clear decision method.</t>
<t>Can be extended with new mixing methods.</t>
<t>Can be used on specific text media.</t>
<t>Can be used also for SDP-controlled WebRTC sessions with multiple streams in the same data channel.</t>
<t>Cons:</t>
<t> Requires standardization and IANA registration.</t>
<t>Cannot guide SIP routing.</t>
</section>
<section title="SDP format parameter for RTT multi-party capability indication">
<t>An FMTP format parameter can be specified for the RFC 4103 media, to be used in text media SDP
declarations for negotiating RTT multi-party capabilities.
The parameter can have the name "rtt-mixer", with one
or more of its possible values in a comma-separated list.</t>
<t>The possible values in the list are:</t>
<t>
<list style="empty">
<t>rtp-translator</t>
<t>rtp-mixer</t>
<t>t140-mixer</t>
<t>text-mixer</t>
<t>rtp-mesh</t>
<t>multi-session</t>
</list></t>
<t>rtp-translator indicates capability for using the RTP-translator
based coordination of multi-party text.</t>
<t>rtp-mixer indicates capability for using the RTP-mixer based
presentation of multi-party text.</t>
<t> t140-mixer indicates capability for
using the T.140 control code source indicators in a mixer. </t>
<t>text-mixer
indicates capability for using the fallback method with text
formatting for conference-unaware endpoints.</t>
<t>rtp-mesh indicates capability for using the mesh
based transmission of multi-party text.
</t>
<t>multi-session indicates capability for using separate point-to-point
RTP sessions between all participants.</t>
<t>Example: a=fmtp:98 cps=30;rtt-mixer=rtp-mixer </t>
<t>If, after evaluation of the alternatives in this specification, only one mixing
method is selected to be brought to implementation, then the parameter can
be reduced to a single parameter with no list of values. </t>
<t></t>
<t>An offer-answer exchange should take place and the common method selected by the
answering party shall be used in the session with that UA.</t>
<t>When no common method is declared, then only the fallback method can be used.</t>
<t>Pros:</t>
<t> Provides a clear decision method.</t>
<t>Can be extended with new mixing methods.</t>
<t>Can be used on specific text media.</t>
<t>Can be used also for SDP-controlled WebRTC sessions with multiple streams in the same data channel.</t>
<t>Cons:</t>
<t> Requires standardization and IANA registration.</t>
<t> May cause interop problems with current RFC4103 implementations not expecting a new fmtp-parameter. </t>
<t>Cannot guide SIP routing.</t>
</section>
<section title="Preferred capability declaration method.">
<t>The preferred capability declaration method is the one with SDP attributes because it is straightforward and partially
usable also for WebRTC.
</t>
</section>
</section>
<section title="Identification of the source of text">
<t>EDITOR NOTE: The text in the following sections needs to be adapted after
recommendations for the main methods for coordination of RTT have been selected.
Details should be provided mainly for the recommended method.</t>
<t>The main way to identify the source of text in the RTP based solution is by
the SSRC of the sending participant. It is included in the CSRC list of the transmitted packets.
Further identification that may be needed for better labeling of received text may be achieved from a number of sources.
These may be the RTCP SDES CNAME and NAME reports and the conference notification data (RFC 4575).</t>
<t>As soon as a new member is added to the RTP session, its
characteristics should be transmitted in RTCP SDES CNAME and NAME reports
according to section 6.5 in RFC 3550. The information about the participant
should also be included in the conference data including the text media member
in a notification according to RFC 4575.</t>
<t>The RTCP SDES report SHOULD contain identification of the source
represented by the SSRC/CSRC identifier. This identification MUST contain the
CNAME field and MAY contain the NAME field and other defined fields of
the SDES report.</t>
<t>A focus UA SHOULD primarily convey SDES information received from the
sources of the session members. When such information is not available,
the focus UA SHOULD compose SSRC/CSRC, CNAME and NAME information from
available information from the SIP session with the participant.</t>
</section>
<section title="Presentation of multi-party text">
<t>All session participants MUST observe the SSRC/CSRC field of incoming text
RTP packets, and make note of what source they came from in order to be
able to present text in a way that makes it easy to read text from each
participant in a session, and get information about the source of the
text.</t>
<section title="Associating identities with text streams">
<t>A source identity SHOULD be composed from available information
sources and displayed together with the text as indicated in the
Appendix of <xref target="T.140">ITU-T T.140</xref>.</t>
<t>The source identity should primarily be the NAME field from incoming SDES
packets. If this information is not available, and the session is a
two-party session, then the T.140 source identity SHOULD be composed
from the SIP session participant information. For multi-party sessions
the source identity may be composed by local information if sufficient
information is not available in the session.</t>
<t>Applications may abbreviate the presented source identity to a
suitable form for the available display.</t>
</section>
<section title="Presentation details for multi-party aware UAs.">
<t>The multi-party aware UA should, after any action for recovery of data from
lost packets, separate the incoming streams and present them according
to the style that the receiving application supports and the user has selected.
The decisions taken for presentation of the multi-party interchange shall be purely
on the receiving side. The sending application must not insert any item in the stream
to influence presentation that is not requested by the sending participant.</t>
<section title="Bubble style presentation">
<t>One often used style is to present real-time text in chunks in readable bubbles
identified by labels containing names of sources. Bubbles are placed in one column in the presentation area
and are closed and moved upwards in the presentation area after certain items or
events, when there is also newer text from another source that would go into a new bubble.
The text items that allow bubble closing are any character closing a phrase or sentence
followed by a space or a timeout of a suitable time (about 10 seconds).</t>
<t>Real-time active text sent from the local user should be presented in a separate area. When there is a reason
to close a bubble from the local user, the bubble should be placed above all real-time active bubbles,
so that the time order that real-time text entries were completed is visible.</t>
<t>Scrolling is usually provided for viewing of recent or older text. When scrolling is done to an
earlier point in the text, the presentation shall not move the scroll position by new received text.
It must be the decision of the local user to return to automatic viewing of latest text actions.
It may be useful with an indication that there is new text to read after scrolling to an earlier
position has been activated.</t>
<t>The presentation area may become too small to present all text in all real-time active bubbles. Various
techniques can be applied to provide a good overview and good reading opportunity even in such situations.
The active real-time bubble may have a limited number of lines and if their contents need more lines,
then a scrolling opportunity within the real-time active bubble is provided. Another method can be to only
show the label and the last line of the active real-time bubble contents, and make it possible to expand or
compress the bubble presentation between full view and one line view. </t>
<t> Erasures require special consideration. Erasure within a real-time active bubble is straightforward.
But if erasure from one participant affects the last character before a bubble, the
whole previous bubble becomes the actual bubble for real-time action by that participant
and is placed below all other bubbles in the presentation area. If the border between bubbles
was caused by the CRLF characters, only one erasure action is required to erase this bubble border.
When a bubble is closed, it is moved up, above all real-time active bubbles.</t>
</section>
<section title="Other presentation styles">
<t>Other presentation styles than the bubble style may be arranged and appreciated by the users.
In a video conference one way may be to have a real-time text area below the video view of each participant.
Another view may be to provide one column in a presentation area for each participant and place the text entries
in a relative vertical position corresponding to when text entry in them was completed. The labels can then be placed
in the column header. The considerations for ending and moving and erasure of entered text discussed above for the bubble
style are valid also for these styles. </t>
</section>
</section>
</section>
<section title="Presentation details for multi-party unaware UAs.">
<t>Multi-party unaware UAs are prepared only for presentation of two sources of text, the local user and a remote user. In order to enable some multi-party communication with such a UA, the mixer needs to plan the presentation and insert labels and line breaks before labels. Many limitations appear for this presentation mode, and it must be seen as a fallback and a last resort.
</t>
<t> See Appendix A for an informative example of a procedure for presenting RTT to a conference-unaware UA.</t>
</section>
<section title="Transmission of text from each user">
<t>UAs participating in sessions with real-time text SHOULD send SDES
packets in RTCP giving values to appropriate identification fields.</t>
<t>The CNAME field SHALL be included in SDES packets.</t>
<t>The NAME field should be given a value that is suitable as an
identifier of text from the user of the UA.</t>
</section>
<section title="Robustness and indication of possible loss">
<t>This section discusses the means for robustness against loss of text
that are already specified and their performance in the multi-party situation.
Means for reducing the risk of loss are discussed, as well as ways to detect
in which stream loss has occurred.</t>
<t>TBD</t>
</section>
<section title="Performance">
<t>This section discusses performance and performance limitations for the different
transport solutions, and indicates which means for performance increase versus
load limitations can be suitable to apply compared to the point-to-point case.</t>
<t>TBD</t>
</section>
<section anchor="Security" title="Security Considerations">
<t>The security considerations valid for RFC 4103 and RFC 3550 are valid
also for the multi-party sessions with text.</t>
</section>
<section anchor="IANA" title="IANA Considerations">
<t>EDITOR NOTE: TBD after decision of proposed preferences in the draft.</t>
<t>This document introduces the TBD /SIP media tag/SDP media level attribute/ rtt-mixer, with a
comma-separated parameter list containing the following possible
values:</t>
<t>
<list style="empty">
<t>rtp-translator</t>
<t>rtp-mixer</t>
<t>t140-mixer</t>
<t>text-mixer</t>
<t>rtp-mesh</t>
<t>multi-session</t>
</list>
</t>
<t>rtp-translator indicates capability for using the RTP-translator
based coordination of multi-party text.</t>
<t>rtp-mixer indicates capability for using the RTP-mixer based
presentation of multi-party text.</t>
<t> t140-mixer indicates capability for
using the T.140 control code source indicators in a mixer. </t>
<t>text-mixer
indicates capability for using the fallback method with text
formatting for conference-unaware endpoints.</t>
<t>rtp-mesh indicates capability for using the mesh
based transmission of multi-party text.
</t>
<t>multi-session indicates capability for using separate point-to-point
RTP sessions between all participants.</t>
</section>
<section title="Congestion considerations">
<t>The congestion considerations described in RFC 4103 are valid also
for multi-party use of the real-time text RTP transport. A risk for
congestion may appear if a number of conference participants are active
transmitting text simultaneously, because this multi-party transmission
method does not allow multiple sources of text to contribute to the same
packet.</t>
<t>In situations of risk for congestion, the Focus UA MAY combine
packets from the same source to increase the transmission interval per
source up to one second. Local conference policy in the Focus UA may be
used to decide which streams shall be selected for such transmission
frequency reduction.</t>
</section>
<section title="Acknowledgements">
<t>Arnoud van Wijk for contributions to an earlier, expired draft of this memo.</t>
</section>
</middle>
<!-- *****BACK MATTER ***** -->
<back>
<!-- References split into informative and normative -->
<!-- There are 2 ways to insert reference entries from the citation libraries:
1. define an ENTITY at the top, and use "ampersand character"RFC2629; here (as shown)
2. simply use a PI "less than character"?rfc include="reference.RFC.2119.xml"?> here
(for I-Ds: include="reference.I-D.narten-iana-considerations-rfc2434bis.xml")
Both are cited textually in the same manner: by using xref elements.
If you use the PI option, xml2rfc will, by default, try to find included files in the same
directory as the including file. You can also define the XML_LIBRARY environment variable
with a value containing a set of directories to search. These can be either in the local
filing system or remote ones accessed by http (http://domain/dir/... ).-->
<references title="Normative References">
<!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml"?-->
<!-- <?rfc include="http://xml.resource.org/public/rfc/bibxml3/reference.I-D.hellstrom-textpreview.xml"?> -->
&RFC2119;
&RFC3261;
&RFC3550;
&RFC4103;
&RFC4575;
&RFC4579;
<reference anchor="T.140" target="http://www.itu.int/rec/T-REC-T.140/en">
<front>
<title>Protocol for multimedia application text conversation</title>
<author surname="ITU-T">
<organization></organization>
</author>
<date year="1998" />
</front>
</reference>
</references>
<references title="Informative References">