Coverage for src/crawler/by_source/nsjom/nsjom_2010_crawler.py: 79%
109 statements
import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

if typing.TYPE_CHECKING:
    from ..nsjom_crawler import NsjomCrawler

source_domain = "NSJOM"


def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    periode_start: int = 0,
    periode_end: int = datetime.now().year,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages
    (e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2010.html),
    covering the years 2010 to 2014 inclusive.
    """
    xissues: list[IssueData] = []
    year_start = max(2010, periode_start)
    year_end = min(2014, periode_end)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.download_file(url)
        xissues = xissues + parse_year(self, year_content, year, url)

    return xissues
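
# Minimal usage sketch (hypothetical; assumes a configured NsjomCrawler instance).
# Note that `content` is unused above: each year page is downloaded inside the loop.
#
#     crawler = NsjomCrawler()
#     xissues = parse_collection_content(crawler, "", periode_start=2010, periode_end=2012)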


def is_heading(element: Tag):
    """Returns True if the element is an issue heading (class HeadingNSJOM)."""
    if element.select_one(".HeadingNSJOM"):
        return True
    classname = element.get("class")
    if not classname:
        return False

    # BeautifulSoup usually returns multi-valued attributes such as `class`
    # as a list of strings, but some configurations yield a plain string.
    if isinstance(classname, str):
        if classname == "HeadingNSJOM":
            return True
        return False

    if "HeadingNSJOM" in classname:
        return True
    return False


def parse_issue_content(self, content: str, xissue: IssueData):
    if not xissue.year:
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_crawl: str | None = None,
):
    soup = BeautifulSoup(content, "html.parser")
    page_elements = soup.select("p.HeadingNSJOM, p.style1, p.style1+blockquote a")
    issues: list[list[Tag]] = []
    # Group tags into issues: each heading starts a new issue and the
    # following tags belong to it.
    for current_element in page_elements:
        if is_heading(current_element):
            issues.append([current_element])
            continue
        if current_element.text == "\xa0":
            continue
        if not issues:
            # Skip stray elements appearing before the first issue heading.
            continue
        issues[-1].append(current_element)

    xissues: list[IssueData] = []
    for issue_elements in issues:
        xissue = parse_issue_tags(self, issue_elements, year, pid_to_crawl)
        if not xissue:
            continue
        xissue.url = url
        xissues.append(xissue)

    return xissues
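
# Expected page shape (illustrative sketch, not verbatim NSJOM markup):
#
#     <p class="HeadingNSJOM">NSJOM Vol. 40, No. 1</p>          <- starts a new issue
#     <p class="style1">40 / 1 / 2010 / 1-12: J. Doe</p>        <- article metadata
#     <blockquote><a href="...">Article title</a></blockquote>  <- title with PDF link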


def parse_issue_tags(
    self: "NsjomCrawler", tags: list[Tag], year: int, pid_to_crawl: str | None = None
):
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)
    issue_title_tag = tags.pop(0)
    match = re.search(
        r"NSJOM Vol\. (?P<Volume>\d+)(?:, No\. (?P<Issue>\d+))?", issue_title_tag.text
    )
    if match is None:
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_crawl and xissue.pid != pid_to_crawl:
        return None

    # The remaining tags come in (metadata, title) pairs, one pair per article.
    while len(tags) > 0:
        article_meta = tags.pop(0)
        article_meta_text = cleanup_str(article_meta.text)
        if article_meta_text == "":
            continue

        article_title = tags.pop(0)
        article = parse_article(
            self,
            article_meta_text,
            article_title,
            xissue.pid,
            len(xissue.articles),
            f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        )
        if article is None:
            continue
        xissue.articles.append(article)
    return xissue
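
# Illustrative heading-to-PID mapping (sample text is hypothetical):
#
#     "NSJOM Vol. 40, No. 1" crawled for year 2010
#     -> volume "40", number "1", pid "NSJOM_2010_40_1"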


def parse_article(
    self: "NsjomCrawler",
    meta_text: str,
    title_tag: Tag,
    issue_pid: str,
    index: int,
    source: str | None = None,
):
    xarticle = create_articledata()
    if source:
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)
    match = re.search(r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta_text)
    if match is None:
        raise ValueError("Cannot parse authors or page number")
    self.set_pages(xarticle, match.group("Pages"))
    # Split the author string on " and " (and comma) separators.
    authors = re.findall(r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", match.group("Authors"))
    for a in authors:
        author = create_contributor(role="author", string_name=a)
        xarticle.contributors.append(author)

    article_pdf_link = title_tag.get("href")
    if article_pdf_link is None:
        print("[NSJOM] article does not have a pdf")
        return None
    if isinstance(article_pdf_link, list):
        raise ValueError("Article link is a list")
    pdf_link = self.source_website + article_pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

    return xarticle
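
# Illustrative parse of an article meta line (sample values are hypothetical;
# the meaning of the three leading numeric fields is not specified here):
#
#     meta = "40 / 1 / 2010 / 1-12: J. Doe and J. Roe"
#     m = re.search(r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta)
#     m.group("Pages")    # "1-12"
#     re.findall(r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", m.group("Authors"))
#     # -> ["J. Doe", "J. Roe"]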