Coverage for src / crawler / cmds / mixed_citation.py: 93%

53 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1""" 

2Helpers to create an xml string for mixed citations 

3 

4 

5""" 

6 

7from lxml import etree 

8from lxml.builder import E 

9from lxml.etree import Element 

10from ptf.cmds.xml.jats.jats_parser import JatsBase 

11from ptf.cmds.xml.xml_utils import escape 

12from ptf.display.resolver import extids_formats, reverse_extids_hrefs 

13from ptf.model_data import RefData 

14 

15 

class GenericRefElement:
    """Node of a small XML tree that can be rendered with lxml.

    ``name`` is the tag name; it is only declared here, so it must be set by a
    subclass or on the instance before rendering. ``elements`` holds the
    children in document order, each one either a plain string or another
    ``GenericRefElement``. ``attributes`` holds the XML attributes of the tag.
    """

    name: str
    elements: list["str | GenericRefElement"]
    attributes: dict[str, str]

    def __init__(self) -> None:
        super().__init__()
        # Containers are created per instance so nodes never share children
        # or attributes.
        self.attributes = {}
        self.elements = []

    def get_xml_element(self):
        """Return the lxml element for this node, converting nested nodes recursively."""
        children: list[str | Element] = [
            child if isinstance(child, str) else child.get_xml_element()
            for child in self.elements
        ]
        return E(self.name, *children, **self.attributes)

    def get_xml_string(self):
        """Return this node serialized as a pretty-printed XML string.

        Make sure the hierarchy of the objects (self.elements) is valid JATS
        before calling this function.
        """
        root = self.get_xml_element()
        serialized = etree.tostring(root, pretty_print=True)
        return serialized.decode()

43 

44 

def find_extlink(url: str):
    """Match ``url`` against the prefixes listed in ``reverse_extids_hrefs``.

    Returns a ``(type, content)`` pair for the first prefix that ``url`` starts
    with: the value mapped to that prefix, and what is left of ``url`` once the
    prefix is stripped. Returns ``(None, None)`` when no prefix matches.
    """
    for prefix in reverse_extids_hrefs:
        if not url.startswith(prefix):
            continue
        link_type = reverse_extids_hrefs[prefix]
        remainder = url[len(prefix):]
        return link_type, remainder
    return None, None

52 

53 

class ExtLinkXml(GenericRefElement):
    """``ext-link`` element pointing at ``url``.

    How to use :
    ```py
    url = "https://doi.org/10.1017/S0143385700002364"
    extlink = ExtLinkXml(url)
    citation_builder.elements.append(extlink)
    ```

    When ``type`` is not one of ``extids_formats``, the URL is matched against
    the known prefixes with ``find_extlink``; on a match, the detected type and
    the URL remainder replace ``type`` and the displayed content.
    """

    name = "ext-link"

    def __init__(self, url: str, content: str | None = None, type: str = "uri") -> None:
        super().__init__()

        # Missing or empty content: display the URL itself.
        link_text = content or url
        link_type = type

        if link_type not in extids_formats:
            detected_type, detected_text = find_extlink(url)
            link_type = detected_type or link_type
            link_text = detected_text or link_text

        self.attributes.update({"ext-link-type": link_type, "href": url})

        # NOTE(review): the text is escaped here and then handed to lxml as
        # element text - confirm this does not escape it twice.
        self.elements.append(escape(link_text))

80 

81 

class MixedCitation(GenericRefElement):
    """Builder for a ``mixed-citation`` element.

    ```py
    citation_builder = MixedCitation()

    citation_builder.label = "[1]"

    persongroup = GenericRefElement()
    persongroup.name = "person-group"
    persongroup.elements.append("Michel B.")
    citation_builder.elements.append(persongroup)

    citation_builder.elements.append("untagged string to put between")

    year = GenericRefElement()
    year.name = "year"
    year.elements.append("2020")
    citation_builder.elements.append(year)

    citation_builder.get_jats_ref()
    ```
    """

    name = "mixed-citation"
    label: str | None = None

    def get_jats_ref(self):
        """Return ``JatsBase.bake_ref`` applied to a ``RefData`` built from this citation.

        The ``RefData`` is created with ``lang="en"``. Its ``citation_xml`` is the
        serialized citation, preceded by a ``<label>`` element when ``label``
        is set.
        """
        xref = RefData(lang="en")
        citation_xml = self.get_xml_string()
        if self.label:
            # NOTE(review): the label is inserted verbatim, without XML
            # escaping - confirm callers never pass "&" or "<" in it.
            citation_xml = f"<label>{self.label}</label>" + citation_xml
        xref.citation_xml = citation_xml
        return JatsBase.bake_ref(xref)