Coverage for src/crawler/abstract_crawlers/matching_crawler.py: 48%

25 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-05-21 12:58 +0000

1from concurrent.futures import ( 

2 Executor, 

3 ThreadPoolExecutor, 

4) 

5from typing import TYPE_CHECKING 

6 

7from crawler.abstract_crawlers.threaded_crawler import ThreadedCrawler 

8from crawler.by_source.slc_crawler import IssueData 

9from crawler.cmds.augment.augment_cmd import AugmentArticlesCmd 

10 

11if TYPE_CHECKING: 

12 from concurrent.futures import Future 

13 

14 from crawler.cmds.augment.zbmath import MatchingOperationMessage 

15 

16 

class MatchingCrawler(ThreadedCrawler):
    """Threaded crawler that also queues a zbMATH matching job per issue.

    Matching jobs are submitted to a dedicated single-worker thread pool, so
    they execute one at a time, in submission order. The event messages
    produced by every job are accumulated on the crawler instance.
    """

    # Pool on which the matching jobs are executed.
    zbmath_executor: Executor
    # One future per matching job submitted so far.
    zbmath_futures: "list[Future]"
    # Never assigned in this class; presumably managed by ThreadedCrawler
    # or by callers -- TODO confirm.
    exception: BaseException | None = None

    # Event messages gathered from every matching command that has run.
    matching_event_messages: "list[MatchingOperationMessage]"

    def __init__(self, *args, **kwargs):
        """Initialise the parent crawler, then the matching bookkeeping.

        All positional and keyword arguments are forwarded unchanged to
        the parent constructor.
        """
        super().__init__(*args, **kwargs)
        self.zbmath_futures = []
        self.matching_event_messages = []
        # A single worker serialises the matching jobs.
        self.zbmath_executor = ThreadPoolExecutor(
            max_workers=1,
            thread_name_prefix="crawler_zbmath_thread",
        )

    def _issue_added_callback(self, future: "Future[IssueData]"):
        """Queue a matching job for the issue carried by ``future``.

        Runs in database thread.

        Raises:
            ValueError: if the issue has no pid.

        NOTE(review): if this method is registered through
        ``Future.add_done_callback``, concurrent.futures logs and ignores
        exceptions raised here -- confirm how failures are surfaced.
        """
        issue = future.result()
        issue_pid = issue.pid
        if not issue_pid:
            raise ValueError("Issue doesn't have any pid")
        job = self.zbmath_executor.submit(self.match_zbmath_issue, issue_pid)
        self.zbmath_futures.append(job)

    def match_zbmath_issue(self, issue_pid: str):
        """Run the augment command on one issue and keep its event messages.

        Args:
            issue_pid: pid of the issue to process.
        """
        params = {
            "backend": self.backend,
            "collection_pid": self.collection_id,
            "source_id": self.source_domain,
            "issue_pids": [issue_pid],
        }
        augment_cmd = AugmentArticlesCmd(params)
        augment_cmd.do()
        self.matching_event_messages.extend(augment_cmd.event_messages)