#!/usr/local/bin/rebol -cs

REBOL [
  Title: "Server object handler script"
  File: %server.r
  Author: [ "HY" ]
  Purpose: { This script parses robots.txt files and stores
             a hash! of forbidden paths. This is very useful for
             webbots or spiders of any kind (at least if obeying
             robot exclusion standards is desirable).

             I looked at the ht://dig package (http://www.htdig.org/)
             script that does the same thing as this script to see if I
             had overlooked something, and thereby fixed a few bugs.
           }
  Date: 16-Aug-2003
  Examples: {
     politebot: server "PoliteBot"
     politebot/add reduce ["www.netscape.com" read http://www.netscape.com/robots.txt
                   "www.google.com" read http://www.google.com/robots.txt ]
     print politebot/forbidden? http://www.google.com/search?q=rebol&ie=UTF-8&oe=UTF-8&hl=no
     print politebot/forbidden? http://www.netscape.com/
  }
  History: [
             19-Oct-2004 {Removed a bug that caused this path: /rebol/index.html
                          to be forbidden if robots.txt had Disallow: /i
                          I should do something about port numbers in URLs as well.}
             23-Sep-2004 {Added the library header}
             22-Apr-2004 {Removed a bug that modified an incoming variable
                          (from other scripts) directly.}
           ]
  Library: [
    level: 'intermediate
    domain: [parse http text-processing web]
    license: none
    Platform: 'all
    Tested-under: none
    Type: [module]
    Support: none
  ]
]

server-handler-object: context [

  ; 'context should automatically pick up set-words
  ; and bind them to the new local context, so:
  bot-name: ""
  patterns: make hash! copy []
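  ; 'bot-name is matched against User-agent lines; 'patterns maps a host
  ; string to the block of Disallow prefixes collected for that host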

  add: func [
    "Parses robots.txt contents and stores the Disallow patterns per host"
    block [block!] "Pairs of host string and robots.txt contents (or none)"
    /local seen-my-name? pay-attention? my-patterns lines line hash parsed name rest
  ] [

    foreach [ host robots-file ] block [
      seen-my-name?: false
      pay-attention?: false
      my-patterns: copy []

      if none? robots-file [robots-file: ""]

      lines: parse/all robots-file "^/"

      forall lines [
        line: first lines
        if hash: find line "#" [ line: head clear hash ] ; strip a trailing comment

        if all [0 < length? line #"#" = first line] [
          line: ""
        ]

        parsed: parse/all line ":"
        if not 0 = length? parsed [

          name: trim first parsed
          either 1 < length? parsed [
            rest: trim first next parsed
          ] [
            rest: ""
          ]


          if all [name = "user-agent"] [
            either all [rest = "*" not seen-my-name?] [
              pay-attention?: true
            ] [
              either rest = bot-name [
                either not seen-my-name? [
                  seen-my-name?: true
                  pay-attention?: true
                  clear my-patterns ; ignore previous messages for "*"
                ] [
                  pay-attention?: false ; only take first section with our name
                ]
              ] [
                pay-attention?: false ; none of our business
              ] ; end either rest = bot-name
            ] ; end either rest is * and not seen my name
          ] ; end name = user-agent

          if all [pay-attention? name = "disallow"] [
            if not 0 = length? rest [
              append my-patterns rest
            ]
          ] ; end pay attention and name is disallow

        ] ; end length? parsed not 0

      ] ; end forall lines

      append patterns reduce [host my-patterns]
    ]

  ] ; end add
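
  ; After 'add, the 'patterns hash holds host / path-list pairs, conceptually:
  ;   make hash! ["www.example.com" ["/cgi-bin/" "/tmp/"]]
  ; (hypothetical host and paths, shown only to illustrate the layout)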

  got-robots-file?: func [
                      "Tells whether or not we have the robots.txt file for a given host"
                      host [string!] "The hostname to look for"
                    ] [
    found? find patterns host
  ]
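
  ; A possible usage sketch (the host is hypothetical), pairing got-robots-file?
  ; with add so robots.txt is fetched at most once per host:
  ;   if not politebot/got-robots-file? "www.example.com" [
  ;     politebot/add reduce [
  ;       "www.example.com" attempt [read http://www.example.com/robots.txt]
  ;     ]
  ;   ]
  ; 'attempt yields none if the read fails, which add treats as an empty file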

  forbidden?: func [
               "Tells whether or not a given URL is forbidden by the robots.txt file"
               in-url [object! string! url!]
               {The URL to check. This must be a complete URL. If this is an
                object, it is assumed to be a url-handler object
                (http://www.oops-as.no/roy/rebol-scripts/url-handler.r).
               }
               /local url path pattern
             ] [

    ; don't modify the incoming variable! ('copy does not accept object! values,
    ; so objects are passed through as-is)
    url: either object? in-url [in-url] [copy in-url]

    if url? url [url: to-string url]

    if string? url [

      if "" = url [return false] ; this might be just a bit too quick ...

      ; assume this is a complete URL and hand it to url-handler
      ; (defined in %url-handler.r, which must be loaded separately):
      url: url-handler url

    ]

    ; allowing objects, but assuming they are url-handler objects
    ; (http://www.rebol.org/cgi-bin/cgiwrap/rebol/view-script.r?color=yes&script=url-handler.r)
    ; skip past the protocol + host prefix of the plain URL; what remains is the path
    path: find/match url/plain join url/protocol url/host
    if none? path [return false] ; prefix didn't match, so there is nothing to test

    pattern: select patterns url/host
    if none? pattern [return false] ; Maybe load robots.txt instead, to check for real?

    foreach p pattern [
      if found? find/part path p length? p [
        return true
      ]
    ]

    false

  ]
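
  ; Note: forbidden? matches Disallow values as plain prefixes of the path,
  ; so a value of "/cgi-bin" also covers "/cgi-bin-archive/". That mirrors
  ; the robots exclusion convention, where Disallow values are path prefixes.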

]

; shortcut:
server: func [
               "Shorthand for make server-handler-object [ bot-name: name ]"
               name [string!]
               "The bot's name. This is the name that we will look for in the robots.txt files."
             ] [

  return make server-handler-object [ bot-name: name ]

]
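
; A commented end-to-end sketch, reusing a host from the header Examples
; (needs %url-handler.r loaded; 'attempt guards against a failed read):
;
;   politebot: server "PoliteBot"
;   politebot/add reduce [
;     "www.google.com" attempt [read http://www.google.com/robots.txt]
;   ]
;   foreach candidate [
;     http://www.google.com/search?q=rebol
;     http://www.google.com/
;   ] [
;     if not politebot/forbidden? candidate [print ["OK to fetch:" candidate]]
;   ]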