1+ import  markdown 
2+ from  bs4  import  BeautifulSoup , NavigableString , Tag 
3+ import  string 
class Helper:
    """Utilities for converting GitHub-Flavored Markdown sections (text +
    one table) into JSON-friendly Python structures."""

    @staticmethod
    def parse_gfm_section(html_content):
        """
        Parse a GitHub-Flavored Markdown section containing a table and surrounding content.
        Returns a dict with "before_html", "columns", "rows_html", and "after_html".
        """
        rendered = markdown.markdown(html_content, extensions=['extra'])
        soup = BeautifulSoup(rendered, "html.parser")

        table = soup.find('table')
        if table is None:
            # No table at all: the entire rendered fragment is "before" content.
            return {"before_html": rendered, "columns": [], "rows_html": [], "after_html": ''}

        # find_previous_siblings yields nearest-first, so reverse to restore
        # document order before joining.
        before_html = ''.join(str(sibling) for sibling in reversed(table.find_previous_siblings()))
        after_html = ''.join(str(sibling) for sibling in table.find_next_siblings())

        # Header labels, one per <th>.
        columns = [header.get_text(strip=True) for header in table.find_all('th')]

        # Data rows: every <tr> after the first (header) row, each as a list
        # of raw <td> HTML strings.
        rows_html = [
            [str(cell) for cell in row.find_all('td')]
            for row in table.find_all('tr')[1:]
        ]

        return {
            "before_html": before_html,
            "columns": columns,
            "rows_html": rows_html,
            "after_html": after_html
        }

    @staticmethod
    def parse_cell(html_td):
        """Convert a table cell HTML into plain text or a dict for links/images."""
        cell_soup = BeautifulSoup(html_td, "html.parser")
        anchor = cell_soup.find('a')
        if anchor is None:
            # Plain cell: collapse to stripped text.
            return cell_soup.get_text(strip=True)

        result = {"url": anchor.get('href', '')}
        image = anchor.find('img')
        if image is None:
            result["link_text"] = anchor.get_text(strip=True)
        else:
            # Image link: record the image source/title alongside the anchor text.
            result["img_src"] = image.get('src', '')
            result["title"] = image.get('title', '')
            result["link_text"] = anchor.get_text(strip=True)
        return result

    @staticmethod
    def parse_html_parts(html_fragment):
        """
        Convert an HTML fragment into a list of parts.
        Each part is either:
        - {"text": "..."}
        - {"link": "url", "text": "..."}
        - {"img_src": "url", "alt": "...", "title": "..."}
        """
        fragment_soup = BeautifulSoup(html_fragment, 'html.parser')
        parts = []

        def visit(node):
            # Depth-first walk; leaves append to the shared `parts` list.
            if isinstance(node, NavigableString):
                stripped = str(node).strip()
                # Keep only text that has at least one non-punctuation character.
                if stripped and any(ch not in string.punctuation for ch in stripped):
                    parts.append({"text": stripped})
                return
            if not isinstance(node, Tag):
                return
            if node.name == 'a':
                parts.append({"link": node.get('href', ''), "text": node.get_text(strip=True)})
            elif node.name == 'img':
                parts.append({
                    "img_src": node.get('src', ''),
                    "alt": node.get('alt', ''),
                    "title": node.get('title', '')
                })
            else:
                # Any other tag is transparent: descend into its children.
                for child in node.children:
                    visit(child)

        for top_level in fragment_soup.contents:
            visit(top_level)

        return parts

    @staticmethod
    def section_to_json(section_result):
        """
        Convert a parsed section into structured JSON.
        Returns {"before": [...], "table": [...], "after": [...]}.
        """
        columns = section_result.get('columns', [])
        # One dict per row; zip silently truncates to the shorter of
        # columns/cells, matching the original behavior.
        table_rows = [
            dict(zip(columns, (Helper.parse_cell(cell_html) for cell_html in row_html)))
            for row_html in section_result.get('rows_html', [])
        ]

        return {
            "before": Helper.parse_html_parts(section_result.get('before_html', '')),
            "table": table_rows,
            "after": Helper.parse_html_parts(section_result.get('after_html', ''))
        }