11import re
22from types import NoneType
3- from typing import List , Union
3+ from typing import Callable , List , Union
44
55from bs4 import BeautifulSoup , Comment , Doctype , NavigableString
66from pydantic import BaseModel
4545from jsondoc .models .page import Page
4646from jsondoc .models .shared_definitions import Annotations
4747from jsondoc .rules import is_block_child_allowed
48- from jsondoc .utils import generate_id , get_current_time
48+ from jsondoc .utils import generate_block_id , get_current_time
4949
5050line_beginning_re = re .compile (r"^" , re .MULTILINE )
5151whitespace_re = re .compile (r"[\t ]+" )
@@ -307,7 +307,9 @@ def reconcile_to_rich_text(
307307
308308
309309def reconcile_to_block (
310- block : BlockBase , children : List [CHILDREN_TYPE ]
310+ block : BlockBase ,
311+ children : List [CHILDREN_TYPE ],
312+ typeid : bool = False ,
311313) -> List [CHILDREN_TYPE ]:
312314 """
313315 Given a block and a list of children,
@@ -350,7 +352,7 @@ def reconcile_to_block(
350352 # Get corresponding field from the block
351353 block_field = getattr (block , block_type )
352354 init_kwargs = {
353- "id" : generate_id ( ),
355+ "id" : generate_block_id ( typeid = typeid ),
354356 "created_time" : child .created_time ,
355357 block_type : type (block_field )(),
356358 }
@@ -383,26 +385,20 @@ def reconcile_to_block(
383385
384386
385387class HtmlToJsonDocConverter (object ):
386- class DefaultOptions :
387- autolinks = True
388- code_language = ""
389- code_language_callback = None
390- convert = None
391- default_title = False
392- keep_inline_images_in = []
393- strip = None
394- force_page = False
395-
396- class Options (DefaultOptions ):
397- pass
class Options(BaseModel):
    """Validated conversion options for ``HtmlToJsonDocConverter``.

    ``strip`` and ``convert`` both carry tag names (the constructor refuses
    to accept both at once), so they are typed as tag lists. Typing them as
    ``str`` / ``Callable`` made pydantic reject the list values they are
    meant to hold.
    """

    autolinks: bool = True
    # Default language for code blocks built from <pre> elements.
    code_language: str = ""
    # Called with the <pre> element; a falsy result falls back to
    # ``code_language``.
    code_language_callback: Callable | None = None
    # Tag names to convert. Mutually exclusive with ``strip``.
    convert: list[str] | None = None
    default_title: bool = False
    # Parent tag names under which inline images are kept as image blocks.
    keep_inline_images_in: list[str] = []
    # Tag names to skip. A bare string is still accepted for backward
    # compatibility, but ``tag not in strip`` then matches by substring.
    strip: list[str] | str | None = None
    # Wrap the result in a page even when the input is not detected as one.
    force_page: bool = False
    # Forwarded as ``typeid=`` to the block/page factory calls.
    typeid: bool = False
398398
399399 def __init__ (self , ** options ):
400- # Create an options dictionary. Use DefaultOptions as a base so that
401- # it doesn't have to be extended.
402- self .options = _todict (self .DefaultOptions )
403- self .options .update (_todict (self .Options ))
404- self .options .update (options )
405- if self .options ["strip" ] is not None and self .options ["convert" ] is not None :
400+ self .options = self .Options (** options )
401+ if self .options .strip is not None and self .options .convert is not None :
406402 raise ValueError (
407403 "You may specify either tags to strip or tags to convert, but not both."
408404 )
@@ -417,7 +413,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase
417413 is_page = self ._is_soup_page (soup )
418414
419415 ret = None
420- if is_page or self .options [ " force_page" ] :
416+ if is_page or self .options . force_page :
421417 title = self ._get_html_title (soup )
422418 # Ensure that children is a list
423419 if not isinstance (children , list ):
@@ -427,6 +423,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase
427423 ret = create_page (
428424 title = title ,
429425 children = children ,
426+ typeid = self .options .typeid ,
430427 )
431428 else :
432429 ret = children
@@ -526,7 +523,11 @@ def is_nested_node(el):
526523 if current_level_object is None :
527524 objects = children_objects
528525 elif isinstance (current_level_object , BlockBase ):
529- objects = reconcile_to_block (current_level_object , children_objects )
526+ objects = reconcile_to_block (
527+ current_level_object ,
528+ children_objects ,
529+ typeid = self .options .typeid ,
530+ )
530531 elif isinstance (current_level_object , RichTextBase ):
531532 objects = reconcile_to_rich_text (current_level_object , children_objects )
532533 else :
@@ -615,8 +616,8 @@ def process_text(self, el):
615616
616617 def should_convert_tag (self , tag ):
617618 tag = tag .lower ()
618- strip = self .options [ " strip" ]
619- convert = self .options [ " convert" ]
619+ strip = self .options . strip
620+ convert = self .options . convert
620621 if strip is not None :
621622 return tag not in strip
622623 elif convert is not None :
@@ -629,7 +630,7 @@ def convert_a(self, el, convert_as_inline):
629630 return ConvertOutput (main_object = create_rich_text (url = href ))
630631
# <b> content is converted inline and tagged with the bold annotation.
convert_b = abstract_inline_conversion(lambda self: Annotations(bold=True))
634635
635636 def convert_blockquote (self , el , convert_as_inline ):
@@ -646,7 +647,11 @@ def convert_blockquote(self, el, convert_as_inline):
646647 return ConvertOutput (main_object = create_rich_text ())
647648
648649 # TODO: If text has newlines, split them and add 2, 3, ... lines as children
649- return ConvertOutput (main_object = create_quote_block ())
650+ return ConvertOutput (
651+ main_object = create_quote_block (
652+ typeid = self .options .typeid ,
653+ )
654+ )
650655
651656 def convert_br (self , el , convert_as_inline ):
652657 if convert_as_inline :
@@ -683,40 +688,48 @@ def convert_h1(self, el, convert_as_inline):
683688 if convert_as_inline :
684689 return ConvertOutput (main_object = create_rich_text ())
685690
686- return ConvertOutput (main_object = create_h1_block ())
691+ return ConvertOutput (main_object = create_h1_block (typeid = self . options . typeid ))
687692
def convert_h2(self, el, convert_as_inline):
    """Convert an ``<h2>`` element.

    Returns the block built by ``create_h2_block``; in inline mode a
    rich-text object created with no arguments is returned instead.
    """
    if not convert_as_inline:
        heading = create_h2_block(typeid=self.options.typeid)
        return ConvertOutput(main_object=heading)

    return ConvertOutput(main_object=create_rich_text())
693698
def convert_h3(self, el, convert_as_inline):
    """Convert an ``<h3>`` element.

    Returns the block built by ``create_h3_block``; in inline mode a
    rich-text object created with no arguments is returned instead.
    """
    if not convert_as_inline:
        heading = create_h3_block(typeid=self.options.typeid)
        return ConvertOutput(main_object=heading)

    return ConvertOutput(main_object=create_rich_text())
699704
def convert_h4(self, el, convert_as_inline):
    """Convert an ``<h4>`` element to a paragraph block.

    In inline mode a rich-text object created with no arguments is
    returned instead of a block.
    """
    if not convert_as_inline:
        paragraph = create_paragraph_block(typeid=self.options.typeid)
        return ConvertOutput(main_object=paragraph)

    return ConvertOutput(main_object=create_rich_text())
705712
def convert_h5(self, el, convert_as_inline):
    """Convert an ``<h5>`` element to a paragraph block.

    In inline mode a rich-text object created with no arguments is
    returned instead of a block.
    """
    if not convert_as_inline:
        paragraph = create_paragraph_block(typeid=self.options.typeid)
        return ConvertOutput(main_object=paragraph)

    return ConvertOutput(main_object=create_rich_text())
711720
def convert_h6(self, el, convert_as_inline):
    """Convert an ``<h6>`` element to a paragraph block.

    In inline mode a rich-text object created with no arguments is
    returned instead of a block.
    """
    if not convert_as_inline:
        paragraph = create_paragraph_block(typeid=self.options.typeid)
        return ConvertOutput(main_object=paragraph)

    return ConvertOutput(main_object=create_rich_text())
717728
def convert_hr(self, el, convert_as_inline):
    """Convert an ``<hr>`` element to a divider block (inline mode is ignored)."""
    divider = create_divider_block(typeid=self.options.typeid)
    return ConvertOutput(main_object=divider)
720733
721734 convert_i = convert_em
722735
@@ -730,13 +743,14 @@ def convert_img(self, el, convert_as_inline):
730743 # title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
731744 if (
732745 convert_as_inline
733- and el .parent .name not in self .options [ " keep_inline_images_in" ]
746+ and el .parent .name not in self .options . keep_inline_images_in
734747 ):
735748 return alt
736749
737750 return ConvertOutput (
738751 main_object = create_image_block (
739752 url = src ,
753+ typeid = self .options .typeid ,
740754 # alt is not supported in JSON-DOC yet
741755 # caption=alt,
742756 )
@@ -755,28 +769,38 @@ def convert_list(self, el, convert_as_inline):
def convert_li(self, el, convert_as_inline):
    """Convert an ``<li>`` element to a list-item block.

    Items whose direct parent is an ``<ol>`` become numbered list items;
    every other item (including one with no parent) becomes a bullet item.
    """
    container = el.parent
    is_ordered = container is not None and container.name == "ol"
    make_item = (
        create_numbered_list_item_block
        if is_ordered
        else create_bullet_list_item_block
    )
    return ConvertOutput(main_object=make_item(typeid=self.options.typeid))
761779
def convert_p(self, el, convert_as_inline):
    """Convert a ``<p>`` element to a paragraph block.

    In inline mode a rich-text object created with no arguments is
    returned instead of a block.
    """
    if not convert_as_inline:
        paragraph = create_paragraph_block(typeid=self.options.typeid)
        return ConvertOutput(main_object=paragraph)

    return ConvertOutput(main_object=create_rich_text())
767787
def convert_pre(self, el, convert_as_inline):
    """Convert a ``<pre>`` element to a code block.

    Returns ``None`` when the element has no text. The language comes from
    the ``code_language`` option unless ``code_language_callback`` is set
    and returns a truthy value for this element.
    """
    if not el.get_text():
        return None

    opts = self.options
    language = opts.code_language
    callback = opts.code_language_callback
    if callback:
        # A falsy callback result keeps the configured default language.
        language = callback(el) or language

    code_block = create_code_block(language=language, typeid=opts.typeid)
    return ConvertOutput(main_object=code_block)
780804
def convert_script(self, el, convert_as_inline):
    """Return ``None`` for ``<script>`` elements; no object is produced."""
    return None
@@ -793,19 +817,19 @@ def convert_style(self, el, convert_as_inline):
# Notion has no counterpart for <sub> and <sup>, so both are converted
# inline with an Annotations object constructed with no arguments.
convert_sub = abstract_inline_conversion(lambda self: Annotations())

convert_sup = abstract_inline_conversion(lambda self: Annotations())
803827
def convert_table(self, el, convert_as_inline):
    """Convert a ``<table>`` element to a table block.

    ``has_column_header`` is set from ``html_table_has_header_row(el)``.
    """
    table = create_table_block(
        has_column_header=html_table_has_header_row(el),
        typeid=self.options.typeid,
    )
    return ConvertOutput(main_object=table)
811835
@@ -841,10 +865,15 @@ def convert_td(self, el, convert_as_inline):
841865 paragraph_block.rich_text will be extracted to form table_row.cells.
842866 """
843867 # Get colspan
844- colspan = el .get ("colspan" , 1 )
868+ colspan = el .get ("colspan" , "1" )
845869 # Get rowspan
846870 # rowspan = el.get("rowspan", 1)
847871 # We need to come up with a much different way to handle rowspan
872+ if not isinstance (colspan , int ):
873+ try :
874+ colspan = int (colspan )
875+ except ValueError :
876+ colspan = 1
848877
849878 next_objects = []
850879 if colspan > 1 :
def convert_tr(self, el, convert_as_inline):
    """Convert a ``<tr>`` element (table row) to a table-row block."""
    row = create_table_row_block(typeid=self.options.typeid)
    return ConvertOutput(main_object=row)
867898
868899
869900def html_to_jsondoc (html : str | bytes , ** options ) -> Page | BlockBase | List [BlockBase ]:
0 commit comments