source: golgotha/src/test/web_crawl/crawl.cc @ 80

Last change on this file since 80 was 80, checked in by Sam Hocevar, 14 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 9.1 KB
Line 
1/********************************************************************** <BR>
2  This file is part of Crack dot Com's free source code release of
3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
4  information about compiling & licensing issues visit this URL</a>
5  <PRE> If that doesn't help, contact Jonathan Clark at
6  golgotha_source@usa.net (Subject should have "GOLG" in it)
7***********************************************************************/
8
9#include "window/window.hh"
10#include "app/app.hh"
11#include "main/main.hh"
12#include "window/style.hh"
13#include "file/file.hh"
14#include "loaders/load.hh"
15#include "window/wmanager.hh"
16#include "math/transform.hh"
17#include "gui/text.hh"
18#include "gui/button.hh"
19#include "gui/image_win.hh"
20#include "loaders/load.hh"
21
22#include "window/colorwin.hh"
23#include "gui/li_pull_menu.hh"
24#include "menu/pull.hh"
25#include "font/anti_prop.hh"
26#include "lisp/lisp.hh"
27#include "win.hh"
28#include "url.hh"
29#include "memory/array.hh"
30#include "checksum/checksum.hh"
31#include "string/string.hh"
32#include <ctype.h>
33
34enum { FB_MAX_THREADS=100 };
35i4_event_handler_reference_class<fb_thread_window> fb_thread_windows[FB_MAX_THREADS];
36
37li_object *fb_add_thread(li_object *o, li_environment *env=0)
38{
39  for (int i=0; i<FB_MAX_THREADS; i++)
40  {
41    if (!fb_thread_windows[i].get())
42    {     
43      i4_graphical_style_class *style=i4_current_app->get_style();
44 
45      fb_thread_window *fb=new fb_thread_window(300, 35, style);
46
47      i4_parent_window_class *mp;
48 
49      if (o) 
50        mp=style->create_mp_window(li_int::get(li_first(o,0),0)->value(),
51                                   li_int::get(li_second(o,0),0)->value(),
52                                   fb->width(), fb->height(), "Thread");
53      else
54        mp=style->create_mp_window(30,30, fb->width(), fb->height(), "Thread");
55
56      mp->add_child(0,0, fb);
57      fb_thread_windows[i]=fb;
58     
59      return li_true_sym;
60    }
61  }
62  return 0;
63}
64
65li_object *fb_add_page(li_object *o, li_environment *env);
66li_object *fb_visited(li_object *o, li_environment *env);
67                       
68
69i4_event_handler_reference_class<i4_text_window_class> fb_counts_window;
70
71class fb_app : public i4_application_class
72{
73public:
74  enum { QUIT };
75
76
77  void init()
78  {       
79    i4_application_class::init();
80       
81    i4_graphical_style_class *style=get_style();
82
83    i4_parent_window_class *cw=i4_add_color_window(wm, 0, style, 0,0, wm->width(), wm->height());
84
85    i4_image_class *logo=i4_load_image("logo.jpg");
86
87    cw->add_child(wm->width()/2-logo->width()/2,
88                  wm->height()/2-logo->height()/2,
89                  new i4_image_window_class(logo, i4_T, i4_F));
90
91    style->icon_hint->background_bitmap=i4_load_image("background.tga");
92   
93    fb_counts_window=new i4_text_window_class("Visited 000000, Unvisited 000000",
94                                              style, style->font_hint->small_font);
95   
96    wm->add_child(wm->width()-fb_counts_window->width(), wm->height()-fb_counts_window->height(),
97              fb_counts_window.get());
98
99    li_create_pull_menu("menu.scm")->show(cw, 0,0);
100       
101    li_add_function("add_thread", fb_add_thread);
102    li_add_function("add_page", fb_add_page);
103    li_add_function("visited", fb_visited);
104
105    i4_file_class *fp=i4_open("filebot.scm");
106    if (!fp)
107    {
108      fp=i4_open("default.scm");
109      if (!fp)
110        i4_error("no default?");
111      int size=fp->size();
112      i4_file_class *out=i4_open("filebot.scm",I4_WRITE);
113      if (!out) i4_error("couldn't write to filebot.scm");
114      char buf[2000];
115      fp->read(buf, size);
116      out->write(buf,size);
117      delete fp;
118      delete out;
119     
120    } else delete fp;
121   
122    li_load("filebot.scm");
123   
124    i4_mkdir(li_string::get(li_get_value("save_path",0),0)->value());   
125  } 
126 
127  i4_array<w32> visited_urls;
128  i4_array<fb_url *> urls_to_visit;
129
130  fb_app()
131    : visited_urls(0, 1024),
132      urls_to_visit(0, 1024)
133  {
134  }
135
136  void uninit()
137  {
138    visited_urls.uninit();        // free up memory associated with the arrays
139    urls_to_visit.uninit();
140    i4_application_class::uninit();
141  }
142
143
144  fb_url *get_best_url()
145  {
146    int i;
147    for (i=0; i<urls_to_visit.size(); i++)
148      if (urls_to_visit[i])
149      {
150        char *name=urls_to_visit[i]->full_name;
151        if (name[strlen(name)-1]=='3' && (strstr(name, ".mp3") || strstr(name, ".MP3")))
152        {
153          fb_url *ret=urls_to_visit[i];
154          urls_to_visit[i]=0;
155          return ret;
156        }
157      }
158
159       
160    for (i=0; i<urls_to_visit.size(); i++)
161      if (urls_to_visit[i])
162      {
163        char *name=urls_to_visit[i]->full_name;
164        if (strstr(name, "mp3") || strstr(name, "MP3"))
165        {
166          fb_url *ret=urls_to_visit[i];
167          urls_to_visit[i]=0;
168          return ret;
169        }
170      }
171
172
173    for (i=0; i<urls_to_visit.size(); i++)
174      if (urls_to_visit[i])
175      {
176        fb_url *ret=urls_to_visit[i];
177        urls_to_visit[i]=0;
178        return ret;
179      }
180
181    return 0;
182  }
183 
184  void add_new_url(char *uname, fb_url *current_url)
185  {
186    fb_url *url_copy=new fb_url(uname, current_url);
187   
188    // see if we have visited this site already.  Instead of storing the whole string
189    // for places we've been we store the checksum of the name
190    w32 csum=url_copy->checksum();
191
192    int i;
193    for (i=0; i<visited_urls.size(); i++)
194      if (visited_urls[i]==csum)
195        return;                             // we've already been there
196
197    for (i=0; i<urls_to_visit.size(); i++)
198      if (!urls_to_visit[i])
199      {
200        urls_to_visit[i]=url_copy;
201        return ;
202      }
203   
204    urls_to_visit.add(url_copy);
205
206  }
207
208 
209  void scan_url_data(char *s, int len, fb_url *current_url)
210  {
211    len-=10;            // need this much space to have a href
212    char uname[400];   // store temporary hrefs here
213       
214    for (int i=0; i<len;)
215    {
216      // is this a href?
217      if (fb_strneq(s+i, "<a href=", 8)==1)
218      {
219        i+=8;
220        while (s[i] && s[i]!='"') i++;      // find quote
221        i++;                                // skip it
222
223
224        char *up=uname;                     // save the rest of the href
225        int t=0;
226        while (s[i] && s[i]!='"')
227        {
228          t++;
229          if (t==398) s[i]=0;
230          else
231            *(up++)=s[i++];
232           
233        }
234        *up=0;
235        i++;
236
237
238        add_new_url(uname, current_url);
239      }
240      else i++;
241    }
242  }
243
244 
245  void calc_model()
246  {
247    for (int i=0; i<FB_MAX_THREADS; i++)
248    {
249      if (fb_thread_windows[i].get())
250      {
251        int state=fb_thread_windows[i]->get_state();
252        if (state==FB_DONE_READING)       // has url data it wan't us to process
253        {
254          w8 *buf=fb_thread_windows[i]->save_buffer;
255          int size=fb_thread_windows[i]->save_buffer_size;
256          fb_url *url=fb_thread_windows[i]->get_url();
257
258
259          scan_url_data((char *)buf, size, url);   // scan buffer for new url's to follow
260
261          i4_file_class *fp=i4_open("visited_urls.scm", I4_APPEND);
262          if (fp)
263          {
264            fp->printf("(visited \"%s\") ; %d %s \n", url->full_name, size,
265                       fb_thread_windows[i]->get_error_string());
266            delete fp;
267          }
268           
269         
270          // reset this thread
271          fb_thread_windows[i]->save_buffer_size=0;
272          state=FB_WAITING;                  // ready to accept new commands
273          fb_thread_windows[i]->ack_data();  // tell window we are done with it's data
274
275        }
276
277        if (state==FB_WAITING)     // is the thread ready for a new request?
278        {
279          fb_url *best=get_best_url();
280          if (best)
281          {
282           
283            visited_urls.add(best->checksum());
284            fb_thread_windows[i]->set_url(best);
285          }
286        }
287
288       
289        fb_thread_windows[i]->update();
290       
291      }
292
293     
294    }
295
296   
297    if (fb_counts_window.get())
298    {
299      int t=0;
300      for (int i=0; i<urls_to_visit.size(); i++)
301        if (urls_to_visit[i]) t++;
302     
303      char buf[100];
304      sprintf(buf, "Visited %d, Unvisited %d", visited_urls.size(), t);
305      fb_counts_window->set_text(new i4_str(buf));
306    }
307
308  }
309
310  void receive_event(i4_event *ev)   
311  {
312 
313    if (ev->type()==i4_event::DO_COMMAND)
314    {
315      char *cmd=((i4_do_command_event_class *)ev)->command;
316      if (cmd)
317      {
318        if (strcmp(cmd, "exit")==0)
319          quit();
320        else
321          li_call(cmd);
322      }
323    }       
324  }
325
326  char *name() { return "fb_app"; }
327} *file_bot;
328
329li_object *fb_visited(li_object *o, li_environment *env)
330{
331  char *s=li_string::get(li_first(o,0),0)->value();
332  w32 checksum=i4_check_sum32(s,strlen(s));   
333  file_bot->visited_urls.add(checksum);
334
335  return 0;
336}
337
338li_object *fb_add_page(li_object *o, li_environment *env)
339{
340  file_bot->add_new_url(li_string::get(li_first(o,0),0)->value(), 0);
341  return 0;
342}
343
344void i4_main(w32 argc, i4_const_str *argv)
345{
346  fb_app f;
347  file_bot=&f;
348  file_bot->run();
349}
350
351
Note: See TracBrowser for help on using the repository browser.