Coverage for src/crawler/cmds/mixed_citation.py: 33%

53 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1""" 

2Helpers to create an xml string for mixed citations 

3 

4```py 

5citation_builder = MixedCitation() 

6 

7citation_builder.label = "[1]" 

8 

9persongroup = GenericRefElement()

10persongroup.name = "person-group" 

11persongroup.elements.append("Michel B.") 

12citation_builder.elements.append(persongroup) 

13 

14citation_builder.elements.append("untagged string to put between") 

15 

16year_builder = Year()

17year_builder.elements.append("2020")

18citation_builder.elements.append(year_builder)

19 

20citation_builder.get_jats_ref()

21``` 

22""" 

23 

24from lxml import etree 

25from lxml.builder import E 

26from lxml.etree import Element 

27from ptf.cmds.xml.jats.jats_parser import JatsBase 

28from ptf.cmds.xml.xml_utils import escape 

29from ptf.display.resolver import extids_formats, reverse_extids_hrefs 

30from ptf.model_data import RefData 

31 

32 

class GenericRefElement:
    """
    Generic node of a reference tree that can be rendered through lxml.

    A node has a tag name, an ordered list of children (plain strings or
    other nodes) and a dict of XML attributes.
    """

    # Tag name of the element; not assigned in __init__, so subclasses or
    # callers must set it before rendering.
    name: str
    # Children in document order: strings or nested GenericRefElement nodes.
    elements: list["str | GenericRefElement"]
    # Attributes passed to the lxml builder as keyword arguments.
    attributes: dict[str, str]

    def __init__(self) -> None:
        super().__init__()
        self.attributes = {}
        self.elements = []

    def get_xml_element(self):
        """
        Build the lxml element for this node.

        String children are handed to the lxml builder unchanged; node
        children are converted recursively with their own get_xml_element().
        """
        children = [
            child if isinstance(child, str) else child.get_xml_element()
            for child in self.elements
        ]
        return E(self.name, *children, **self.attributes)

    def get_xml_string(self):
        """
        Return this node serialised as a pretty-printed XML string.

        Make sure the hierarchy of the objects (self.elements) is valid JATS
        before calling this function
        """
        root = self.get_xml_element()
        serialized = etree.tostring(root, pretty_print=True)
        return serialized.decode()

60 

61 

def find_extlink(url: str):
    """
    Match `url` against the prefixes listed in `reverse_extids_hrefs`.

    Returns a `(type, content)` pair for the first prefix `url` starts with:
    `type` is the value stored for that prefix and `content` is `url` with
    the prefix removed. Returns `(None, None)` when no prefix matches.
    """
    prefix = next(
        (candidate for candidate in reverse_extids_hrefs if url.startswith(candidate)),
        None,
    )
    if prefix is None:
        return None, None
    return reverse_extids_hrefs[prefix], url.removeprefix(prefix)

69 

70 

class ExtLinkXml(GenericRefElement):
    """
    `ext-link` node built from a URL.

    `content` is the link text and defaults to the URL itself. When `type`
    is not one of `extids_formats`, the URL is run through find_extlink();
    a detected type and content, when non-empty, replace the given ones.
    """

    name = "ext-link"

    def __init__(self, url: str, content: str | None = None, type: str = "uri") -> None:
        super().__init__()

        # Missing or empty content falls back to the URL.
        link_text = content if content else url
        link_type = type

        if link_type not in extids_formats:
            detected_type, detected_text = find_extlink(url)
            if detected_type:
                link_type = detected_type
            if detected_text:
                link_text = detected_text

        self.attributes["ext-link-type"] = link_type
        self.attributes["href"] = url

        # NOTE(review): the text is escaped here and then serialised by lxml
        # later on - confirm this does not escape the text twice.
        self.elements.append(escape(link_text))

89 

90 

class MixedCitation(GenericRefElement):
    """
    Root `mixed-citation` node of a reference, with an optional label.
    """

    name = "mixed-citation"
    # Optional plain-text label of the reference (e.g. "[1]"); when set, it
    # is emitted as a <label> element in front of the citation XML.
    label: str | None = None

    def get_jats_ref(self):
        """
        Build a RefData from this citation and pass it to JatsBase.bake_ref().

        The RefData is created with lang="en" and its `citation_xml` is the
        serialised `mixed-citation`, preceded by `<label>...</label>` when
        `self.label` is set. The label is treated as plain text.

        Returns whatever JatsBase.bake_ref() returns.
        """
        xref = RefData(lang="en")
        xref.citation_xml = self.get_xml_string()
        if self.label:
            # Build the label with lxml rather than string formatting, so
            # that characters such as "&" or "<" in the label are escaped
            # and the resulting citation_xml stays well-formed.
            label_xml = etree.tostring(E("label", self.label), encoding="unicode")
            xref.citation_xml = label_xml + xref.citation_xml
        return JatsBase.bake_ref(xref)