Coverage for src / crawler / by_source / nsjom / nsjom_xml_crawler.py: 86%

122 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

1import re 

2import typing 

3 

4from bs4 import BeautifulSoup, Tag 

5from ptf.model_data import ( 

6 IssueData, 

7 create_abstract, 

8 create_articledata, 

9 create_contributor, 

10 create_extlink, 

11 create_issuedata, 

12 create_publisherdata, 

13 create_subj, 

14) 

15 

16from crawler.utils import add_pdf_link_to_xarticle 

17 

18if typing.TYPE_CHECKING: 

19 from .nsjom_crawler import NsjomCrawler 

20 

# Provider identifier used as the default in PIDs, error messages and ext-link metadata.
source_domain = "NSJOM"

22 

23 

def parse_collection_content(
    self: "NsjomCrawler",
    _: str,
    source_domain: str = "NSJOM",
    xissue_pid_to_parse: str | None = None,
):
    """
    Parse every published article from the collection-wide XML file
    https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml (2015 to today).

    Articles are grouped into issues keyed by (volume, issue number); when
    `xissue_pid_to_parse` is given, only the matching issue is built.
    Returns the list of parsed issues.
    """
    issues_by_key: dict[tuple[str, str], IssueData] = {}
    xml_body = self.download_file("https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml")
    records_root = BeautifulSoup(xml_body, "lxml-xml").select_one("records")
    if records_root is None:
        raise ValueError(f"[{source_domain}] Cannot parse source")

    for record in records_root.select("record"):
        status_tag = record.select_one("publicationType")
        if status_tag is None:
            raise ValueError(f"[{source_domain}] Cannot determine article publicationType")
        # Only records flagged as published make it into the collection.
        if status_tag.text != "published":
            continue

        year_tag = record.select_one("year")
        if year_tag is None or year_tag.text == "":
            raise ValueError(f"[{source_domain}] Cannot parse year from article")
        year = int(year_tag.text)

        xarticle, volume_number, issue_number = parse_article(
            self, record, source_domain=source_domain
        )
        key = (volume_number, issue_number)
        if key not in issues_by_key:
            pid = f"{source_domain}_{year}__{volume_number}_{issue_number}"
            # When a single issue was requested, skip records of other issues.
            if xissue_pid_to_parse and xissue_pid_to_parse != pid:
                continue
            new_issue = create_issuedata()
            parse_issue_tag(new_issue, record, year)
            new_issue.year = year_tag.text
            new_issue.volume = volume_number
            new_issue.number = issue_number
            new_issue.pid = pid
            issues_by_key[key] = new_issue
        issues_by_key[key].articles.append(xarticle)

    return list(issues_by_key.values())

68 

69 

def parse_issue_content(self: "NsjomCrawler", content: str, xissue: IssueData):
    """
    Parse a single issue by delegating to the collection-wide parser,
    restricted to this issue's pid. Requires `xissue.year` to be set.
    """
    if xissue.year:
        return parse_collection_content(self, content, source_domain, xissue.pid)
    raise ValueError("Issue year is not set")

74 

75 

def parse_issue_tag(xissue: IssueData, article_tag: Tag, year: int) -> IssueData:
    """
    Fill issue-level metadata (publisher and source ext-link) from an article
    record. Mutates and returns the same `xissue` object.
    """
    publisher_element = article_tag.select_one("publisher")
    if publisher_element:
        publisher = create_publisherdata()
        publisher.name = publisher_element.text
        xissue.publisher = publisher

    # Link back to the issue's landing page on the journal site.
    xissue.ext_links.append(
        create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            metadata=source_domain,
        )
    )
    return xissue

90 

91 

def parse_article(self: "NsjomCrawler", article_tag: Tag, source_domain: str = "NSJOM"):
    """
    Parse one <record> element of NSJOM.xml into an article.

    Returns a (xarticle, volume_number, issue_number) tuple; the volume and
    issue numbers let the caller group articles into issues.

    Raises:
        ValueError: if the doi, or the volume/issue pair, cannot be found.
    """
    xarticle = create_articledata()

    doi_tag = article_tag.select_one("doi")
    if doi_tag is None:
        raise ValueError(f"[{source_domain}] : Article doi not found")
    xarticle.doi = doi_tag.text
    # Derive the pid from the doi by replacing every '/', '.' and '-' with '_'.
    # Fixed: the previous pattern "\\/\\.-" only matched the literal 3-char
    # sequence "/.-", so slashes and dots survived into the pid.
    xarticle.pid = re.sub(r"[/.\-]", "_", doi_tag.text)

    page_start_tag = article_tag.select_one("startPage")
    page_end_tag = article_tag.select_one("endPage")
    if page_start_tag:
        xarticle.fpage = page_start_tag.text
    if page_end_tag:
        xarticle.lpage = page_end_tag.text

    date_published_tag = article_tag.select_one("publicationDate")
    if date_published_tag:
        xarticle.date_published_iso_8601_date_str = date_published_tag.text

    # publisherRecordId is the key used by the article landing page URL.
    url_tag = article_tag.select_one("publisherRecordId")
    if url_tag:
        ext_link = create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/paper.html?noid={url_tag.text}",
            metadata=source_domain,
        )
        xarticle.ext_links.append(ext_link)

    title_tag = article_tag.select_one("title")
    if title_tag:
        xarticle.title_tex = title_tag.text

    # TODO : Affiliations ?

    authors_container = article_tag.select_one("authors")
    if authors_container:
        for author_tag in authors_container.select("author"):
            author = create_contributor(role="author")
            author_name_tag = author_tag.select_one("name")
            if author_name_tag:
                author["string_name"] = author_name_tag.text
            corresponding = author_tag.get("corresponding")
            if corresponding == "1":
                author["corresponding"] = True
            email_tag = author_tag.select_one("email")
            if email_tag:
                author["email"] = email_tag.text
            xarticle.contributors.append(author)

    abstract_tag = article_tag.select_one("abstract")
    if abstract_tag:
        # Fixed typo: the attribute is "language", not "langauge". The
        # misspelling meant the attribute was never read and the "eng"
        # fallback was always used (coverage showed line 145 always true).
        abstract_language = abstract_tag.get("language", None)
        if abstract_language is None or isinstance(abstract_language, list):
            abstract_language = "eng"
        xarticle.abstracts.append(
            create_abstract(
                value_tex=abstract_tag.text,
                lang=abstract_language or self.detect_language(abstract_tag.text) or "und",
            )
        )

    keywords_tag = article_tag.select_one("keywords")
    if keywords_tag:
        keywords_language = keywords_tag.get("language", "eng")
        if keywords_language is None or isinstance(keywords_language, list):
            keywords_language = "eng"
        for kwd_tag in keywords_tag.select("keyword"):
            subject = create_subj()
            subject["value"] = kwd_tag.text
            # NOTE(review): keywords_language is computed above but unused;
            # keeping the historical hard-coded "en" — confirm whether the
            # attribute value ("eng"-style codes) should be used instead.
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    msc_tag = article_tag.select_one("MSCs")
    if msc_tag:
        for msc_subj in msc_tag.select("MSC"):
            subject = create_subj()
            subject["value"] = msc_subj.text
            subject["type"] = "msc"
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    pdf_location_tag = article_tag.select_one("filelocation")
    pdf_name_tag = article_tag.select_one("file")
    if pdf_location_tag and pdf_name_tag:
        pdf_url = "https://sites.dmi.uns.ac.rs/nsjom/" + pdf_location_tag.text + pdf_name_tag.text
        add_pdf_link_to_xarticle(xarticle, pdf_url)

    volume_tag = article_tag.select_one("volume")
    issue_tag = article_tag.select_one("issue")
    if volume_tag is None or issue_tag is None:
        raise ValueError(
            f"[{source_domain}] {xarticle.doi} Cannot parse volume or issue from article"
        )

    # Citations ?

    return xarticle, volume_tag.text, issue_tag.text