Coverage for src / crawler / by_source / nsjom / nsjom_xml_crawler.py: 86%
122 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
1import re
2import typing
4from bs4 import BeautifulSoup, Tag
5from ptf.model_data import (
6 IssueData,
7 create_abstract,
8 create_articledata,
9 create_contributor,
10 create_extlink,
11 create_issuedata,
12 create_publisherdata,
13 create_subj,
14)
16from crawler.utils import add_pdf_link_to_xarticle
18if typing.TYPE_CHECKING:
19 from .nsjom_crawler import NsjomCrawler
21source_domain = "NSJOM"
def parse_collection_content(
    self: "NsjomCrawler",
    _: str,
    source_domain: str = "NSJOM",
    xissue_pid_to_parse: str | None = None,
):
    """
    Parse every article of the collection from the single XML feed
    https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml (2015 to today).

    Groups articles into issues keyed by (volume, issue number) and returns
    the list of issues found. When ``xissue_pid_to_parse`` is given, only the
    issue with that pid is kept.
    """
    url = "https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml"
    soup = BeautifulSoup(self.download_file(url), "lxml-xml")
    records_root = soup.select_one("records")
    if records_root is None:
        raise ValueError(f"[{source_domain}] Cannot parse source")

    issues: dict[tuple[str, str], IssueData] = {}
    for record in records_root.select("record"):
        publication_type = record.select_one("publicationType")
        if publication_type is None:
            raise ValueError(f"[{source_domain}] Cannot determine article publicationType")
        # Only published records are crawled.
        if publication_type.text != "published":
            continue

        year_tag = record.select_one("year")
        if year_tag is None or year_tag.text == "":
            raise ValueError(f"[{source_domain}] Cannot parse year from article")
        year = int(year_tag.text)

        xarticle, volume, number = parse_article(self, record, source_domain=source_domain)
        issue_key = (volume, number)
        if issue_key not in issues:
            pid = f"{source_domain}_{year}__{volume}_{number}"
            # When filtering on a single issue pid, skip every other issue
            # (and, by extension, all of its articles).
            if xissue_pid_to_parse and xissue_pid_to_parse != pid:
                continue
            xissue = create_issuedata()
            parse_issue_tag(xissue, record, year)
            xissue.year = year_tag.text
            xissue.volume = volume
            xissue.number = number
            xissue.pid = pid
            issues[issue_key] = xissue
        issues[issue_key].articles.append(xarticle)

    return list(issues.values())
def parse_issue_content(self: "NsjomCrawler", content: str, xissue: IssueData):
    """
    Parse one issue by re-running the collection parser restricted to this
    issue's pid.

    Raises ValueError when the issue's year is not set.
    """
    if xissue.year:
        return parse_collection_content(self, content, source_domain, xissue.pid)
    raise ValueError("Issue year is not set")
def parse_issue_tag(xissue: IssueData, article_tag: Tag, year: int) -> IssueData:
    """
    Fill issue-level metadata (publisher, source link) from one article record.

    Mutates ``xissue`` in place and returns it.
    """
    publisher_tag = article_tag.select_one("publisher")
    # bs4 truthiness: an empty <publisher/> tag is falsy, so it is skipped too.
    if publisher_tag:
        publisher = create_publisherdata()
        publisher.name = publisher_tag.text
        xissue.publisher = publisher

    xissue.ext_links.append(
        create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            metadata=source_domain,
        )
    )
    return xissue
def parse_article(self: "NsjomCrawler", article_tag: Tag, source_domain: str = "NSJOM"):
    """
    Build an ArticleData from one <record> element of the NSJOM XML feed.

    Returns a tuple ``(xarticle, volume, issue)`` where volume and issue are
    the text of the record's <volume> and <issue> tags.

    Raises ValueError when the DOI, volume or issue tag is missing.
    """
    xarticle = create_articledata()

    doi_tag = article_tag.select_one("doi")
    if doi_tag is None:
        raise ValueError(f"[{source_domain}] : Article doi not found")
    xarticle.doi = doi_tag.text
    # Derive a PID by replacing every '/', '.' or '-' of the DOI with '_'.
    # BUGFIX: the previous pattern "\/\.-" only matched the literal sequence
    # "/.-", so DOIs were used as pids almost unchanged.
    xarticle.pid = re.sub(r"[/.\-]", "_", doi_tag.text)

    page_start_tag = article_tag.select_one("startPage")
    page_end_tag = article_tag.select_one("endPage")
    if page_start_tag:
        xarticle.fpage = page_start_tag.text
    if page_end_tag:
        xarticle.lpage = page_end_tag.text

    date_published_tag = article_tag.select_one("publicationDate")
    if date_published_tag:
        xarticle.date_published_iso_8601_date_str = date_published_tag.text

    # Link back to the article page on the publisher site.
    url_tag = article_tag.select_one("publisherRecordId")
    if url_tag:
        ext_link = create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/paper.html?noid={url_tag.text}",
            metadata=source_domain,
        )
        xarticle.ext_links.append(ext_link)

    title_tag = article_tag.select_one("title")
    if title_tag:
        xarticle.title_tex = title_tag.text

    # TODO : Affiliations ?

    authors_container = article_tag.select_one("authors")
    if authors_container:
        for author_tag in authors_container.select("author"):
            author = create_contributor(role="author")
            author_name_tag = author_tag.select_one("name")
            if author_name_tag:
                author["string_name"] = author_name_tag.text
            # corresponding="1" marks the corresponding author.
            corresponding = author_tag.get("corresponding")
            if corresponding == "1":
                author["corresponding"] = True
            email_tag = author_tag.select_one("email")
            if email_tag:
                author["email"] = email_tag.text
            xarticle.contributors.append(author)

    abstract_tag = article_tag.select_one("abstract")
    if abstract_tag:
        # BUGFIX: the attribute name was misspelled "langauge", so the
        # declared language was never read and "eng" was always assumed.
        abstract_language = abstract_tag.get("language", None)
        if abstract_language is None or isinstance(abstract_language, list):
            abstract_language = "eng"
        xarticle.abstracts.append(
            create_abstract(
                value_tex=abstract_tag.text,
                lang=abstract_language or self.detect_language(abstract_tag.text) or "und",
            )
        )

    keywords_tag = article_tag.select_one("keywords")
    if keywords_tag:
        keywords_language = keywords_tag.get("language", "eng")
        if keywords_language is None or isinstance(keywords_language, list):
            keywords_language = "eng"
        for kwd_tag in keywords_tag.select("keyword"):
            subject = create_subj()
            subject["value"] = kwd_tag.text
            # NOTE(review): keywords_language is parsed above but the stored
            # lang is hard-coded to "en" — confirm whether the declared
            # language should be used instead.
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    msc_tag = article_tag.select_one("MSCs")
    if msc_tag:
        for msc_subj in msc_tag.select("MSC"):
            subject = create_subj()
            subject["value"] = msc_subj.text
            subject["type"] = "msc"
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    pdf_location_tag = article_tag.select_one("filelocation")
    pdf_name_tag = article_tag.select_one("file")
    if pdf_location_tag and pdf_name_tag:
        pdf_url = "https://sites.dmi.uns.ac.rs/nsjom/" + pdf_location_tag.text + pdf_name_tag.text
        add_pdf_link_to_xarticle(xarticle, pdf_url)

    volume_tag = article_tag.select_one("volume")
    issue_tag = article_tag.select_one("issue")
    if volume_tag is None or issue_tag is None:
        raise ValueError(
            f"[{source_domain}] {xarticle.doi} Cannot parse volume or issue from article"
        )

    # Citations ?

    return xarticle, volume_tag.text, issue_tag.text