The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.
require 'rubygems'
require 'mechanize'
require 'logger'
# Build an agent whose activity is logged to mech.log.
agent = WWW::Mechanize.new do |mech|
  mech.log = Logger.new("mech.log")
end
# Select the user agent registered under the 'Mac Safari' alias.
agent.user_agent_alias = 'Mac Safari'

# Load the page, fill in the "q" field of the form named "f", and submit it.
home_page = agent.get("http://www.google.com/")
query_form = home_page.form_with(:name => "f")
query_field = query_form.field_with(:name => "q")
query_field.value = "Hello"
result_page = agent.submit(query_form)

# Print the body of the page returned by the submission.
puts result_page.body
| VERSION | = | '0.8.5' | The version of Mechanize you are using. | |
| AGENT_ALIASES | = | { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" } | User Agent aliases |
| redirect_ok | -> | follow_redirect? |
| ca_file | [RW] | |
| cert | [RW] | |
| conditional_requests | [RW] | |
| cookie_jar | [RW] | |
| follow_meta_refresh | [RW] | |
| history | [R] | |
| history_added | [RW] | |
| html_parser | [RW] | |
| keep_alive | [RW] | |
| keep_alive_time | [RW] | |
| key | [RW] | |
| log | [RW] | |
| open_timeout | [RW] | |
| pass | [RW] | |
| pluggable_parser | [R] | |
| read_timeout | [RW] | |
| redirect_ok | [RW] | |
| redirection_limit | [RW] | |
| scheme_handlers | [RW] | |
| user_agent | [RW] | |
| verify_callback | [RW] | |
| watch_for_set | [RW] |
# File lib/www/mechanize.rb, line 93
# Creates an agent with its default settings and then yields the agent to
# the optional block so the caller can adjust it.
def initialize
  # Settings exposed through attr_accessors
  @cookie_jar    = CookieJar.new
  @log           = nil
  @open_timeout  = nil
  @read_timeout  = nil
  @user_agent    = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @history_added = nil

  # OpenSSL settings
  @ca_file = nil # server certificate file
  # Callback for OpenSSL errors raised while verifying the server
  # certificate chain; useful for debugging, or for ignoring errors by
  # always returning _true_.
  @verify_callback = nil
  @cert = nil # certificate
  @key  = nil # private key
  @pass = nil # password

  @redirect_ok = true # follow redirects?

  # Settings exposed through attr_readers
  @history          = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Authentication state
  @user      = nil # auth user
  @password  = nil # auth password
  @digest    = nil # DigestAuth digest
  @auth_hash = {}  # tracks urls for sending auth

  # Proxy settings
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true

  @follow_meta_refresh = false
  @redirection_limit   = 20

  # Connection cache and keep-alive
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # Looking up an unregistered scheme stores (and returns) a handler that
  # raises UnsupportedSchemeError when it is called.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise UnsupportedSchemeError.new(scheme)
    }
  end
  # The known schemes all share one handler that returns the link unchanged.
  pass_through = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end

  @pre_connect_hook  = Chain::PreConnectHook.new
  @post_connect_hook = Chain::PostConnectHook.new

  yield self if block_given?
end
Sets the user and password to be used for authentication.
# File lib/www/mechanize.rb, line 186
# Stores the credentials used for authentication.
#
# Returns +password+.
def auth(user, password)
  @password = password
  @user = user
  password
end
Clicks the WWW::Mechanize::Link object passed in and returns the page fetched.
# File lib/www/mechanize.rb, line 279
# Follows +link+ and returns the page fetched by #get.
#
# The target is link.href when the link responds to +href+; otherwise the
# link's 'href' entry, falling back to its 'src' entry. The referer is
# link.page when that can be read, otherwise the current page.
def click(link)
  # Best effort: any StandardError raised while reading link.page just
  # means there is no referer to take from the link.
  referer = begin
    link.page
  rescue StandardError
    nil
  end

  href = if link.respond_to?(:href)
    link.href
  else
    link['href'] || link['src']
  end

  get(:url => href, :referer => (referer || current_page()))
end
DELETE to url with query_params, and setting options:
delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 259
# Sends a DELETE request to +url+.
#
# Delegates to #put with +query_params+ unchanged and with +options+
# extended so that :verb is :delete (replacing any :verb the caller gave).
# Returns whatever #put returns.
def delete(url, query_params = {}, options = {})
  delete_options = options.merge(:verb => :delete)
  put(url, query_params, delete_options)
end
Fetches the URL passed in and returns a page.
# File lib/www/mechanize.rb, line 193
# Fetches a URL, records the resulting page in the history, and returns
# it. The page is also yielded to the block when one is given.
#
# Two calling styles are accepted:
#   get(url, parameters = [], referer = nil)
#   get(:url => url, :params => [...], :referer => ..., :headers => {...})
# In the positional style, a second argument that does not respond to
# +each+ is taken to be the referer (legacy signature).
#
# When no referer is supplied, a URL whose string form starts with "http"
# gets a blank page as referer; any other URL uses the current page, or a
# blank page if there is none.
#
# Raises ArgumentError when the hash style is used without :url.
def get(options, parameters = [], referer = nil)
  headers = nil
  if options.is_a?(Hash)
    url = options[:url]
    raise ArgumentError.new("url must be specified") unless url
    parameters = options[:params] || []
    referer = options[:referer]
    headers = options[:headers]
  else
    url = options
    unless parameters.respond_to?(:each) # FIXME: Remove this in 0.8.0
      referer = parameters
      parameters = []
    end
  end

  unless referer
    # Match against the string form so that URI objects are handled like
    # their string equivalents instead of never matching (or raising where
    # Object#=~ is unavailable).
    if url.to_s =~ /^http/
      referer = Page.new(nil, {'content-type'=>'text/html'})
    else
      referer = current_page || Page.new(nil, {'content-type'=>'text/html'})
    end
  end

  # FIXME: Huge hack so that using a URI as a referer works. I need to
  # refactor everything to pass around URIs but still support
  # WWW::Mechanize::Page#base
  unless referer.is_a?(WWW::Mechanize::File)
    referer = referer.is_a?(String) ?
      Page.new(URI.parse(referer), {'content-type' => 'text/html'}) :
      Page.new(referer, {'content-type' => 'text/html'})
  end

  # fetch the page
  page = fetch_page(  :uri      => url,
                      :referer  => referer,
                      :headers  => headers || {},
                      :params   => parameters
                   )
  add_to_history(page)
  yield page if block_given?
  page
end
Fetch a file and return the contents of the file.
# File lib/www/mechanize.rb, line 273
# Fetches +url+ with #get and returns the body of the resulting page.
def get_file(url)
  fetched = get(url)
  fetched.body
end
HEAD to url with query_params, and setting options:
head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 268
# Sends a HEAD request to +url+.
#
# Delegates to #put with +query_params+ unchanged and with +options+
# extended so that :verb is :head (replacing any :verb the caller gave).
# Returns whatever #put returns.
def head(url, query_params = {}, options = {})
  head_options = options.merge(:verb => :head)
  put(url, query_params, head_options)
end
Posts to the given URL with the query parameters passed in. Query parameters can be passed as a hash, or as an array of arrays. Example:
agent.post('http://example.com/', "foo" => "bar")
or
agent.post('http://example.com/', [ ["foo", "bar"] ])
# File lib/www/mechanize.rb, line 298
# Posts +query+ to +url+ by building a throwaway form and handing it to
# #post_form; returns what #post_form returns.
#
# +query+ is iterated as key/value pairs. A value that is an IO becomes a
# file upload (named after the basename of its path, with the IO's
# contents as data) and switches the form to 'multipart/form-data'; any
# other value becomes an ordinary field. Keys are converted with +to_s+.
def post(url, query={})
  # A plain hash stands in for the form node; Form.new only needs the
  # attributes below plus a +search+ that finds nothing.
  node = {
    'method'  => 'POST',
    'enctype' => 'application/x-www-form-urlencoded'
  }
  def node.search(*args)
    []
  end

  form = Form.new(node)
  query.each do |key, value|
    unless value.is_a?(IO)
      form.fields << Form::Field.new(key.to_s, value)
      next
    end

    form.enctype = 'multipart/form-data'
    upload = Form::FileUpload.new(key.to_s, ::File.basename(value.path))
    upload.file_data = value.read
    form.file_uploads << upload
  end
  post_form(url, form)
end
# File lib/www/mechanize.rb, line 164
# Returns the +hooks+ collection held by the post-connect hook object.
def post_connect_hooks
  hook_holder = @post_connect_hook
  hook_holder.hooks
end
# File lib/www/mechanize.rb, line 160
# Returns the +hooks+ collection held by the pre-connect hook object.
def pre_connect_hooks
  hook_holder = @pre_connect_hook
  hook_holder.hooks
end
PUT to url with query_params, and setting options:
put('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 240
# Sends a PUT request to +url+, records the resulting page in the history,
# and returns it. The page is also yielded to the block when one is given.
#
# The request is described by a hash with :uri, :headers (empty),
# :params (+query_params+) and :verb (:put); entries in +options+ take
# precedence over these defaults.
def put(url, query_params = {}, options = {})
  defaults = {
    :uri     => url,
    :headers => {},
    :params  => query_params,
    :verb    => :put
  }
  request_options = defaults.merge(options)

  # fetch the page
  fetched = fetch_page(request_options)
  add_to_history(fetched)
  yield fetched if block_given?
  fetched
end
Sets the proxy address, port, user, and password. addr should be a host, with no "http://" prefix.
# File lib/www/mechanize.rb, line 170
# Stores the proxy host, port and optional credentials.
#
# Returns the four values as an array: [addr, port, user, pass].
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com')
agent.submit(page.forms.first)
With a button:
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/www/mechanize.rb, line 327
# Submits +form+, first adding +button+ to the form's query when a button
# is given.
#
# A POST form is sent through #post_form to the form's action. A GET form
# is sent through #get to the action with its trailing query string
# removed, using the form's built query as parameters and the form's page
# as referer. The method name is compared after upcasing.
#
# Raises RuntimeError ("unsupported method: ...") for any other method.
def submit(form, button=nil)
  form.add_button_to_query(button) if button

  http_method = form.method.upcase
  if http_method == 'POST'
    post_form(form.action, form)
  elsif http_method == 'GET'
    target = form.action.gsub(/\?[^\?]*$/, '')
    get(  :url      => target,
          :params   => form.build_query,
          :referer  => form.page
       )
  else
    raise "unsupported method: #{form.method.upcase}"
  end
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/www/mechanize.rb, line 362
# Yields the agent to the block and returns the block's value. Whatever the
# block does (including raising), the history is afterwards replaced by
# the copy taken before the block ran.
def transact
  saved_history = @history.dup
  begin
    yield self
  ensure
    # Runs on normal exit and on exceptions alike.
    @history = saved_history
  end
end
Returns whether or not a url has been visited
# File lib/www/mechanize.rb, line 348
# Returns true when #visited_page finds something for +url+ (anything other
# than nil), false otherwise.
def visited?(url)
  found = visited_page(url)
  !found.nil?
end
# File lib/www/mechanize.rb, line 568
# Pushes +page+ onto the history together with the result of resolving the
# page's uri, then invokes the +history_added+ callback with the page when
# a callback is set.
#
# Returns the callback's result, or nil when no callback is set.
def add_to_history(page)
  resolved_uri = resolve(page.uri)
  @history.push(page, resolved_uri)
  return unless history_added
  history_added.call(page)
end
uri is an absolute URI
# File lib/www/mechanize.rb, line 405
# Performs one HTTP request and returns the page built from the response.
#
# +params+ is merged over the defaults below; callers in this file pass
# :uri, :referer, :verb, :params, :headers and :redirects.
#
# After the response has been read the method may call itself again: for a
# refresh (only when follow_meta_refresh is set), for a redirection (only
# when follow_redirect? is true), or once per host after a 401 when a user
# or password is configured.
#
# Raises RedirectLimitReachedError when following another refresh or
# redirect would exceed redirection_limit, StandardError for an unparsable
# Refresh header, and ResponseCodeError for any response that is not
# handled here.
def fetch_page(params)
  options = {
    :request    => nil,
    :response   => nil,
    :connection => nil,
    :referer    => current_page(),
    :uri        => nil,
    :verb       => :get,
    :agent      => self,
    :redirects  => 0,
    :params     => [],
    :headers    => {},
  }.merge(params)

  # Each handler in the chain works on the shared +options+ hash; the
  # values read back below (:uri, :request, :connection, ...) come from it.
  before_connect = Chain.new([
    Chain::URIResolver.new(@scheme_handlers),
    Chain::ParameterResolver.new,
    Chain::RequestResolver.new,
    Chain::ConnectionResolver.new(
      @connection_cache,
      @keep_alive,
      @proxy_addr,
      @proxy_port,
      @proxy_user,
      @proxy_pass
    ),
    Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
    Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
    Chain::HeaderResolver.new(  @keep_alive,
                                @keep_alive_time,
                                @cookie_jar,
                                @user_agent),
    Chain::CustomHeaders.new,
    @pre_connect_hook,
  ])
  before_connect.handle(options)

  uri           = options[:uri]
  request       = options[:request]
  cur_page      = options[:referer]
  request_data  = options[:params]
  redirects     = options[:redirects]
  http_obj      = options[:connection]

  # Add If-Modified-Since if page is in history
  if( (page = visited_page(uri)) && page.response['Last-Modified'] )
    request['If-Modified-Since'] = page.response['Last-Modified']
  end if(@conditional_requests)

  # Specify timeouts if given
  http_obj.open_timeout = @open_timeout if @open_timeout
  http_obj.read_timeout = @read_timeout if @read_timeout
  http_obj.start unless http_obj.started?

  # Log specified headers for the request
  log.info("#{ request.class }: #{ request.path }") if log
  request.each_header do |k, v|
    log.debug("request-header: #{ k } => #{ v }")
  end if log

  # Send the request. On a dropped connection the connection is restarted
  # and the request retried; the third consecutive failure is re-raised.
  attempts = 0
  begin
    response = http_obj.request(request, *request_data) { |r|
      connection_chain = Chain.new([
        Chain::ResponseReader.new(r),
        Chain::BodyDecodingHandler.new,
      ])
      connection_chain.handle(options)
    }
  rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
    log.error("Rescuing EOF error") if log
    http_obj.finish
    raise x if attempts >= 2
    request.body = nil
    http_obj.start
    attempts += 1
    retry
  end

  after_connect = Chain.new([
    @post_connect_hook,
    Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
    Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
  ])
  after_connect.handle(options)

  res_klass      = options[:res_klass]
  response_body  = options[:response_body]
  page           = options[:page]

  log.info("status: #{ page.code }") if log

  if follow_meta_refresh
    redirect_uri  = nil
    if (page.respond_to?(:meta) && (redirect = page.meta.first))
      # Count meta refreshes against the redirection limit as well, so a
      # page that keeps refreshing cannot recurse without bound.
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      redirect_uri = redirect.uri.to_s
    elsif refresh = response['refresh']
      parsed_refresh = refresh.match(/^\s*(\d+\.?\d*);\s*(url|URL)=(\S*)\s*$/)
      raise StandardError, "Invalid refresh http header" unless parsed_refresh
      delay = parsed_refresh[1]
      location = parsed_refresh[3]
      # Only a target that starts with an http(s) scheme is absolute; a
      # target that merely contains "http" (e.g. in its query string) is
      # not. Other targets are prefixed with the scheme, host and any
      # non-default port of the request just made.
      unless location =~ /\Ahttps?:\/\//i
        origin = "#{uri.scheme}://#{uri.host}"
        origin << ":#{uri.port}" unless uri.port == uri.default_port
        location = "#{origin}#{location}"
      end
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      # Honour the delay given in the header (whole seconds).
      sleep delay.to_i
      redirect_uri = location
    end
    if redirect_uri
      @history.push(page, page.uri)
      return fetch_page(
        :uri        => redirect_uri,
        :referer    => page,
        :params     => [],
        :verb       => :get,
        :redirects  => redirects + 1
      )
    end
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri) || page
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri  = page.uri
    raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
    # A redirected HEAD stays a HEAD; every other verb becomes a GET.
    redirect_verb = options[:verb] == :head ? :head : :get
    page = fetch_page(  :uri => response['Location'].to_s,
                        :referer => page,
                        :params  => [],
                        :verb => redirect_verb,
                        :redirects => redirects + 1
                     )
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    raise ResponseCodeError.new(page) unless @user || @password
    # An entry for this host means authentication was already attempted.
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      if response['server'] =~ /Microsoft-IIS/
        @auth_hash[uri.host] = :iis_digest
      end
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    # Repeat the same request now that the auth scheme is recorded.
    return fetch_page(  :uri      => uri,
                        :referer  => cur_page,
                        :verb     => request.method.downcase.to_sym,
                        :params   => request_data,
                        :headers  => options[:headers]
                     )
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
# File lib/www/mechanize.rb, line 383
# Sends +form+ to +url+ as a POST request, records the resulting page in
# the history, and returns it.
#
# The referer is the form's page, else the current page, else a blank
# page. The request body is form.request_data; Content-Type is the form's
# enctype and Content-Length is the body's length in bytes.
def post_form(url, form)
  cur_page = form.page || current_page ||
                  Page.new( nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  log.debug("query: #{ request_data.inspect }") if log

  # Content-Length counts bytes, not characters; the two differ for
  # multibyte text. Fall back to +size+ for objects without +bytesize+.
  content_length = request_data.respond_to?(:bytesize) ?
    request_data.bytesize : request_data.size

  # fetch the page
  page = fetch_page(  :uri      => url,
                      :referer  => cur_page,
                      :verb     => :post,
                      :params   => [request_data],
                      :headers  => {
                        'Content-Type'    => form.enctype,
                        'Content-Length'  => content_length.to_s,
                      })
  add_to_history(page)
  page
end