Coverage for src/crawler/by_source/nsjom/nsjom_xml_crawler.py: 85%
126 statements
« prev ^ index » next coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
1import re
2import typing
4from bs4 import BeautifulSoup, Tag
6# from ptf.model_data import create_publisherdata
7from ptf.model_data import (
8 IssueData,
9 create_articledata,
10 create_contributor,
11 create_extlink,
12 create_issuedata,
13 create_publisherdata,
14 create_subj,
15)
17from crawler.utils import add_pdf_link_to_xarticle
if typing.TYPE_CHECKING:
    # Imported only for static type checking; at runtime this import is
    # skipped, presumably to avoid a circular import with nsjom_crawler.
    from ..nsjom_crawler import NsjomCrawler
22source_domain = "NSJOM"
def parse_collection_content(
    self: "NsjomCrawler",
    _: str,
    periode_start: int = 0,
    periode_end: float = float("inf"),
    source_domain: str = "NSJOM",
    xissue_pid_to_parse: str | None = None,
):
    """
    Parses all articles from one xml file : https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml
    From 2015 to today

    :param _: unused content argument (kept for crawler interface compatibility)
    :param periode_start: earliest publication year to keep (inclusive)
    :param periode_end: latest publication year to keep (inclusive)
    :param source_domain: collection id used in pids and error messages
    :param xissue_pid_to_parse: when set, only the issue with this pid is collected
    :returns: list of IssueData with their articles attached
    :raises ValueError: if the XML root or a mandatory record tag is missing
    """
    # Key issues by (year, volume, number): the pid embeds the year, so keying
    # on (volume, number) alone could merge same-numbered issues from
    # different years into one IssueData.
    xissues: dict[tuple[int, str, str], IssueData] = {}
    url = "https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml"
    content = self.download_file(url)
    soup = BeautifulSoup(content, "lxml-xml")
    record_container_element = soup.select_one("records")
    if record_container_element is None:
        raise ValueError(f"[{source_domain}] Cannot parse source")
    for record_element in record_container_element.select("record"):
        publication_type_tag = record_element.select_one("publicationType")
        if publication_type_tag is None:
            raise ValueError(f"[{source_domain}] Cannot determine article publicationType")
        # Skip records that are not actually published (e.g. in-press entries).
        if publication_type_tag.text != "published":
            continue
        year_tag = record_element.select_one("year")
        if year_tag is None or year_tag.text == "":
            raise ValueError(f"[{source_domain}] Cannot parse year from article")
        year = int(year_tag.text)
        if periode_start > year or year > periode_end:
            continue
        xarticle, volume_number, issue_number = parse_article(
            self, record_element, source_domain=source_domain
        )
        issue_key = (year, volume_number, issue_number)
        if issue_key not in xissues:
            pid = f"{source_domain}_{year}__{volume_number}_{issue_number}"
            # Single-issue mode: only materialize the requested issue.
            if xissue_pid_to_parse and xissue_pid_to_parse != pid:
                continue
            xissue = create_issuedata()
            parse_issue_tag(xissue, record_element, year)
            xissue.year = year_tag.text
            xissue.volume = volume_number
            xissue.number = issue_number
            xissue.pid = pid
            xissues[issue_key] = xissue
        xissues[issue_key].articles.append(xarticle)

    return list(xissues.values())
def parse_issue_content(self: "NsjomCrawler", content: str, xissue: IssueData):
    """Re-parse the collection XML restricted to this issue's year and pid.

    :raises ValueError: if the issue has no year set.
    """
    if not xissue.year:
        raise ValueError("Issue year is not set")
    year = int(xissue.year)
    return parse_collection_content(self, content, year, year, source_domain, xissue.pid)
def parse_issue_tag(xissue: IssueData, article_tag: Tag, year: int) -> IssueData:
    """Populate issue-level metadata from an article's <record> tag.

    Copies the publisher name (when present) onto the issue and attaches an
    external "source" link pointing at the NSJOM issue page for *year*.
    Returns the mutated issue for convenience.
    """
    publisher_element = article_tag.select_one("publisher")
    # bs4 truthiness: an empty <publisher/> tag is falsy, so it is skipped too.
    if publisher_element:
        publisher_data = create_publisherdata()
        publisher_data.name = publisher_element.text
        xissue.publisher = publisher_data

    xissue.ext_links.append(
        create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            metadata=source_domain,
        )
    )
    return xissue
def parse_article(self: "NsjomCrawler", article_tag: Tag, source_domain: str = "NSJOM"):
    """Parse one <record> tag into an article.

    :param article_tag: a <record> element from NSJOM.xml
    :param source_domain: collection id used in pids and error messages
    :returns: tuple (xarticle, volume_number, issue_number)
    :raises ValueError: if the doi, volume or issue cannot be found
    """
    xarticle = create_articledata()

    doi_tag = article_tag.select_one("doi")
    if doi_tag is None:
        raise ValueError(f"[{source_domain}] : Article doi not found")
    xarticle.doi = doi_tag.text
    # Derive the pid from the doi by replacing "/", "." and "-" with "_".
    # Bug fix: the previous pattern "\/\.-" only matched the literal
    # three-character sequence "/.-", leaving invalid characters in the pid.
    xarticle.pid = re.sub(r"[/.\-]", "_", doi_tag.text)

    page_start_tag = article_tag.select_one("startPage")
    page_end_tag = article_tag.select_one("endPage")
    if page_start_tag:
        xarticle.fpage = page_start_tag.text
    if page_end_tag:
        xarticle.lpage = page_end_tag.text

    date_published_tag = article_tag.select_one("publicationDate")
    if date_published_tag:
        xarticle.date_published_iso_8601_date_str = date_published_tag.text

    url_tag = article_tag.select_one("publisherRecordId")
    if url_tag:
        ext_link = create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/paper.html?noid={url_tag.text}",
            metadata=source_domain,
        )
        xarticle.ext_links.append(ext_link)

    title_tag = article_tag.select_one("title")
    if title_tag:
        xarticle.title_tex = title_tag.text

    # TODO : Affiliations ?

    authors_container = article_tag.select_one("authors")
    if authors_container:
        for author_tag in authors_container.select("author"):
            author = create_contributor(role="author")
            author_name_tag = author_tag.select_one("name")
            if author_name_tag:
                author["string_name"] = author_name_tag.text
            corresponding = author_tag.get("corresponding")
            if corresponding == "1":
                author["corresponding"] = True
                email_tag = author_tag.select_one("email")
                if email_tag:
                    author["email"] = email_tag.text
            xarticle.contributors.append(author)

    abstract_tag = article_tag.select_one("abstract")
    if abstract_tag:
        # Bug fix: the attribute name was misspelled "langauge", so the
        # declared language was never read and "eng" was always assumed.
        abstract_language = abstract_tag.get("language", None)
        if abstract_language is None or isinstance(abstract_language, list):
            abstract_language = "eng"
        xarticle.abstracts.append(
            {
                "tag": "abstract",
                "value_tex": abstract_tag.text,
                # Fall back to language detection, then "und" (undetermined).
                "lang": abstract_language or self.detect_language(abstract_tag.text) or "und",
            }
        )

    keywords_tag = article_tag.select_one("keywords")
    if keywords_tag:
        # NOTE(review): <keywords> may carry a "language" attribute; it was
        # previously read but never used (keywords are hard-coded to "en").
        # Behavior kept as-is — confirm before honoring the attribute.
        for kwd_tag in keywords_tag.select("keyword"):
            subject = create_subj()
            subject["value"] = kwd_tag.text
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    msc_tag = article_tag.select_one("MSCs")
    if msc_tag:
        for msc_subj in msc_tag.select("MSC"):
            subject = create_subj()
            subject["value"] = msc_subj.text
            subject["type"] = "msc"
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    pdf_location_tag = article_tag.select_one("filelocation")
    pdf_name_tag = article_tag.select_one("file")
    if pdf_location_tag and pdf_name_tag:
        pdf_url = "https://sites.dmi.uns.ac.rs/nsjom/" + pdf_location_tag.text + pdf_name_tag.text
        add_pdf_link_to_xarticle(xarticle, pdf_url)

    volume_tag = article_tag.select_one("volume")
    issue_tag = article_tag.select_one("issue")
    if volume_tag is None or issue_tag is None:
        raise ValueError(
            f"[{source_domain}] {xarticle.doi} Cannot parse volume or issue from article"
        )

    # Citations ?
    return xarticle, volume_tag.text, issue_tag.text