Coverage for src/crawler/by_source/nsjom/nsjom_xml_crawler.py: 85%

124 statements  

« prev     ^ index     » next       coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

1import re 

2import typing 

3 

4from bs4 import BeautifulSoup, Tag 

5 

6# from ptf.model_data import create_publisherdata 

7from ptf.model_data import ( 

8 IssueData, 

9 create_articledata, 

10 create_contributor, 

11 create_extlink, 

12 create_issuedata, 

13 create_publisherdata, 

14 create_subj, 

15) 

16 

17from crawler.utils import add_pdf_link_to_xarticle 

18 

19if typing.TYPE_CHECKING: 19 ↛ 20line 19 didn't jump to line 20 because the condition on line 19 was never true

20 from ..nsjom_crawler import NsjomCrawler 

21 

22source_domain = "NSJOM" 

23 

24 

def parse_collection_content(
    self: "NsjomCrawler",
    _: str,
    source_domain: str = "NSJOM",
    xissue_pid_to_parse: str | None = None,
):
    """Parse every published article from the journal-wide XML file.

    Downloads https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml (covering 2015 to
    today), groups the article records into issues keyed by
    (volume, issue number), and returns the assembled issues as a list.

    When ``xissue_pid_to_parse`` is given, only the matching issue is built.

    Raises:
        ValueError: if the XML has no <records> root, a record has no
            publicationType, or a record has no usable year.
    """
    issues_by_key: dict[tuple[str, str], IssueData] = {}
    xml_url = "https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml"
    soup = BeautifulSoup(self.download_file(xml_url), "lxml-xml")

    records_root = soup.select_one("records")
    if records_root is None:
        raise ValueError(f"[{source_domain}] Cannot parse source")

    for record in records_root.select("record"):
        status_tag = record.select_one("publicationType")
        if status_tag is None:
            raise ValueError(f"[{source_domain}] Cannot determine article publicationType")
        if status_tag.text != "published":
            # Ignore anything not formally published.
            continue

        year_tag = record.select_one("year")
        if year_tag is None or year_tag.text == "":
            raise ValueError(f"[{source_domain}] Cannot parse year from article")
        year = int(year_tag.text)

        xarticle, volume, number = parse_article(self, record, source_domain=source_domain)

        key = (volume, number)
        if key not in issues_by_key:
            pid = f"{source_domain}_{year}__{volume}_{number}"
            if xissue_pid_to_parse and xissue_pid_to_parse != pid:
                # Caller asked for one specific issue; skip the others.
                continue
            xissue = create_issuedata()
            parse_issue_tag(xissue, record, year)
            xissue.year = year_tag.text
            xissue.volume = volume
            xissue.number = number
            xissue.pid = pid
            issues_by_key[key] = xissue
        issues_by_key[key].articles.append(xarticle)

    return list(issues_by_key.values())

69 

70 

def parse_issue_content(self: "NsjomCrawler", content: str, xissue: IssueData):
    """Parse one issue by delegating to the collection parser.

    NSJOM exposes a single journal-wide XML file rather than per-issue
    pages, so this simply re-parses the collection restricted to this
    issue's pid.

    Raises:
        ValueError: if the issue has no year set.
    """
    if not xissue.year:
        raise ValueError("Issue year is not set")
    return parse_collection_content(self, content, source_domain, xissue.pid)

75 

76 

def parse_issue_tag(xissue: IssueData, article_tag: Tag, year: int) -> IssueData:
    """Fill issue-level metadata from an article record and return the issue.

    The NSJOM feed carries publisher data on each article record, so the
    first record seen for an issue is used to populate the issue itself.
    Also attaches a "source" external link pointing at the year's issue page.
    """
    publisher_tag = article_tag.select_one("publisher")
    if publisher_tag:
        publisher = create_publisherdata()
        publisher.name = publisher_tag.text
        xissue.publisher = publisher

    source_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(source_link)
    return xissue

91 

92 

def parse_article(self: "NsjomCrawler", article_tag: Tag, source_domain: str = "NSJOM"):
    """Build an article from one <record> element of the NSJOM XML feed.

    Returns a ``(xarticle, volume, issue_number)`` tuple so the caller can
    group articles into issues.

    Raises:
        ValueError: if the record has no DOI, or no volume/issue element.
    """
    xarticle = create_articledata()

    doi_tag = article_tag.select_one("doi")
    if doi_tag is None:
        raise ValueError(f"[{source_domain}] : Article doi not found")
    xarticle.doi = doi_tag.text
    # Derive the pid by replacing every '/', '.' or '-' of the DOI with '_'.
    # Fix: the previous pattern "\\/\\.-" only matched the literal three-char
    # sequence "/.-", so DOI separators were never actually substituted.
    xarticle.pid = re.sub(r"[/.-]", "_", doi_tag.text)

    page_start_tag = article_tag.select_one("startPage")
    page_end_tag = article_tag.select_one("endPage")
    if page_start_tag:
        xarticle.fpage = page_start_tag.text
    if page_end_tag:
        xarticle.lpage = page_end_tag.text

    date_published_tag = article_tag.select_one("publicationDate")
    if date_published_tag:
        xarticle.date_published_iso_8601_date_str = date_published_tag.text

    # publisherRecordId is the numeric id used by the article landing page.
    url_tag = article_tag.select_one("publisherRecordId")
    if url_tag:
        ext_link = create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/paper.html?noid={url_tag.text}",
            metadata=source_domain,
        )
        xarticle.ext_links.append(ext_link)

    title_tag = article_tag.select_one("title")
    if title_tag:
        xarticle.title_tex = title_tag.text

    # TODO : Affiliations ?

    authors_container = article_tag.select_one("authors")
    if authors_container:
        for author_tag in authors_container.select("author"):
            author = create_contributor(role="author")
            author_name_tag = author_tag.select_one("name")
            if author_name_tag:
                author["string_name"] = author_name_tag.text
            corresponding = author_tag.get("corresponding")
            if corresponding == "1":
                author["corresponding"] = True
            email_tag = author_tag.select_one("email")
            if email_tag:
                author["email"] = email_tag.text
            xarticle.contributors.append(author)

    abstract_tag = article_tag.select_one("abstract")
    if abstract_tag:
        # Fix: the attribute name was misspelled "langauge", so the language
        # declared in the XML was never read and every abstract fell back to
        # the default.
        abstract_language = abstract_tag.get("language", None)
        if abstract_language is None or isinstance(abstract_language, list):
            abstract_language = "eng"
        xarticle.abstracts.append(
            {
                "tag": "abstract",
                "value_tex": abstract_tag.text,
                # Empty-string attribute falls through to language detection.
                "lang": abstract_language or self.detect_language(abstract_tag.text) or "und",
            }
        )

    keywords_tag = article_tag.select_one("keywords")
    if keywords_tag:
        keywords_language = keywords_tag.get("language", "eng")
        if keywords_language is None or isinstance(keywords_language, list):
            keywords_language = "eng"
        for kwd_tag in keywords_tag.select("keyword"):
            subject = create_subj()
            subject["value"] = kwd_tag.text
            # Fix: keywords_language was computed but never used; the lang
            # was hard-coded to "en".
            subject["lang"] = keywords_language
            xarticle.kwds.append(subject)

    msc_tag = article_tag.select_one("MSCs")
    if msc_tag:
        for msc_subj in msc_tag.select("MSC"):
            subject = create_subj()
            subject["value"] = msc_subj.text
            subject["type"] = "msc"
            # NOTE(review): "en" here vs three-letter codes ("eng"/"und")
            # elsewhere in this function — confirm which the model expects.
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    pdf_location_tag = article_tag.select_one("filelocation")
    pdf_name_tag = article_tag.select_one("file")
    if pdf_location_tag and pdf_name_tag:
        pdf_url = "https://sites.dmi.uns.ac.rs/nsjom/" + pdf_location_tag.text + pdf_name_tag.text
        add_pdf_link_to_xarticle(xarticle, pdf_url)

    volume_tag = article_tag.select_one("volume")
    issue_tag = article_tag.select_one("issue")
    if volume_tag is None or issue_tag is None:
        raise ValueError(
            f"[{source_domain}] {xarticle.doi} Cannot parse volume or issue from article"
        )

    # Citations ?

    return xarticle, volume_tag.text, issue_tag.text